diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index be7fe8ea23fac1..3d2ba75d47327a 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -49,7 +49,7 @@ class PSCore; using framework::LoDTensor; using framework::Scope; -using framework::SelectedRows; +using pten::SelectedRows; using framework::Variable; using RpcCtxMap = std::unordered_map; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 147758abfd5553..bce3e46a2b0261 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -76,7 +76,7 @@ void SerializeToMultiVarMsgAndIOBuf( if (var->IsType()) { SerializeLodTensor(var, ctx, send_var_msg, &temp_iobuf); - } else if (var->IsType()) { + } else if (var->IsType()) { SerializeSelectedRows(var, ctx, send_var_msg, &temp_iobuf); } iobuf->append(temp_iobuf); @@ -127,7 +127,7 @@ void SerializeLodTensor(framework::Variable* var, void SerializeSelectedRows(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* var_msg, butil::IOBuf* iobuf) { - framework::SelectedRows* slr = var->GetMutable(); + pten::SelectedRows* slr = var->GetMutable(); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); @@ -255,7 +255,7 @@ void DeserializeSelectedRows( butil::IOBufBytesIterator& io_buffer_itr, // NOLINT const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); - auto* slr = var->GetMutable(); + auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); std::vector tmp_rows(msg.dims()[0]); diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index e2b81ace291478..2e37442e02c69a 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -28,7 +28,7 @@ namespace paddle { namespace 
distributed { using framework::LoDTensor; -using framework::SelectedRows; +using pten::SelectedRows; const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100; @@ -293,7 +293,7 @@ void Communicator::RpcSendSparse(const std::string &var_name, int table_id, std::vector push_g_vec; auto *send_var = scope.FindVar(var_name); - auto *tensor = send_var->GetMutable(); + auto *tensor = send_var->GetMutable(); auto dim = tensor->value().dims()[1]; std::transform(tensor->rows().begin(), tensor->rows().end(), std::back_inserter(sparse_push_keys), @@ -1012,10 +1012,10 @@ void GeoCommunicator::Send(const std::vector &var_names, auto *var = scope.FindVar(table_name); - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( "Only need to send Sparse Grad in Geo mode.")); - auto &rows = var->Get().rows(); + auto &rows = var->Get().rows(); // insert ids which has not been record for (size_t j = 0; j < rows.size(); j++) { @@ -1290,7 +1290,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto cpu_ctx = paddle::platform::CPUDeviceContext(); auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); + auto *t_delta = var_delta->GetMutable(); auto *var_t_value = t_delta->mutable_value(); var_t_value->Resize({static_cast(sparse_ids.size()), dims1}); auto *t_value = var_t_value->mutable_data(cpu_ctx.GetPlace()); diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 7056c9aba62dd5..8a905377974c07 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -193,15 +193,15 @@ inline void MergeVars(const std::string &var_name, result.device(*cpu_ctx.eigen_device()) = result / static_cast(vars.size()); } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); + } else if (var0->IsType()) { + auto &slr0 = var0->Get(); + auto *out_slr = 
out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; + std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { - inputs.push_back(&var->Get()); + inputs.push_back(&var->Get()); } auto dev_ctx = paddle::platform::CPUDeviceContext(); if (merge_add) { diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index b76ab0ae950602..bdc3bee2ea579b 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -39,8 +39,10 @@ #include "paddle/fluid/distributed/table/accessor.h" #include "paddle/fluid/distributed/table/common_table.h" #include "paddle/fluid/distributed/table/graph/graph_node.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/string/string_helper.h" + +#include "paddle/pten/core/utils/rw_lock.h" + namespace paddle { namespace distributed { class GraphShard { diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index a443710bf0fd82..1dc9ffba91cb3b 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -29,8 +29,8 @@ #include "paddle/fluid/distributed/table/depends/initializers.h" #include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #include "paddle/fluid/distributed/table/depends/sparse.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" #define PSERVER_SAVE_SUFFIX ".shard" @@ -110,7 +110,7 @@ struct Meta { class CommonSparseTable : public SparseTable { public: - CommonSparseTable() { rwlock_.reset(new framework::RWLock); } + CommonSparseTable() { rwlock_.reset(new pten::RWLock); } virtual ~CommonSparseTable() {} // unused method begin @@ -193,7 +193,7 @@ class CommonSparseTable 
: public SparseTable { std::shared_ptr optimizer_; std::vector> shard_values_; std::unordered_map> pull_reservoir_; - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index 3408ef5f91ad00..9a327f3f427755 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -32,7 +32,6 @@ #include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/threadpool.h" @@ -43,6 +42,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/sparse_geo_table.h b/paddle/fluid/distributed/table/sparse_geo_table.h index 4ddb1fd706069f..8031117f20c12f 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.h +++ b/paddle/fluid/distributed/table/sparse_geo_table.h @@ -31,8 +31,8 @@ #include "paddle/fluid/distributed/table/depends/initializers.h" #include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #include "paddle/fluid/distributed/table/depends/sparse.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc index 19198b4d207d15..e939df7681774b 100644 --- a/paddle/fluid/distributed/test/brpc_utils_test.cc +++ 
b/paddle/fluid/distributed/test/brpc_utils_test.cc @@ -56,7 +56,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place, // var 3 framework::Variable* var3 = scope->Var("x3"); - auto* slr = var3->GetMutable(); + auto* slr = var3->GetMutable(); slr->set_height(564); auto* tensor3 = slr->mutable_value(); auto* rows = slr->mutable_rows(); @@ -111,7 +111,7 @@ void RunMultiVarMsg(platform::Place place) { // check var3 framework::Variable* var3 = scope_recv.FindVar("x3"); - auto* slr = var3->GetMutable(); + auto* slr = var3->GetMutable(); EXPECT_EQ(slr->rows().size(), 564); for (int i = 0; i < 564; ++i) { EXPECT_EQ(slr->rows()[i], i); diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 0979abc63d6587..b43eda7abc345b 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -197,9 +197,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = - out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -368,8 +367,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -385,8 +384,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { void 
SetDim(paddle::framework::Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index bd7e5c549872d3..3179b96807119e 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -32,8 +32,8 @@ const paddle::framework::Tensor* GetTensorFromVar( const paddle::framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc index 2ee2f9fefa9a34..fbf3205be2fe37 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.cc +++ b/paddle/fluid/eager/legacy/tensor_helper.cc @@ -32,7 +32,7 @@ void InitializeVariable(paddle::framework::Variable *var, if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { @@ -72,9 +72,9 @@ void CopyVariable(const paddle::framework::Variable &src_var, auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + 
auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); @@ -89,8 +89,8 @@ paddle::framework::proto::VarType::Type GetDtypeFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().type(); - } else if (var.IsType()) { - return var.Get().value().type(); + } else if (var.IsType()) { + return var.Get().value().type(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -101,8 +101,8 @@ const paddle::platform::Place &GetPlaceFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().place(); - } else if (var.IsType()) { - return var.Get().place(); + } else if (var.IsType()) { + return var.Get().place(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c4f3f81537ae9f..ce63a58d41ae00 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -383,7 +383,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS tensor) +cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS selected_rows) cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) @@ -393,10 +393,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) cc_test(inlined_vector_test SRCS inlined_vector_test.cc) -if (NOT WIN32) -cc_test(rw_lock_test SRCS 
rw_lock_test.cc) -endif (NOT WIN32) - cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index d8c372becf1b45..22a2847c1d834f 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -120,9 +120,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, tran_lod_tensor->set_format(in_lod_tensor.format()); #endif tran_lod_tensor->ShareDataWith(tensor); - } else if (in_var.IsType()) { - auto &in_selected_rows = in_var.Get(); - auto *trans_selected_rows = out_var->GetMutable(); + } else if (in_var.IsType()) { + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 2e82fe22dba731..1435a82c0f528a 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -237,7 +237,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto selected_rows = var->GetMutable(); + auto selected_rows = var->GetMutable(); auto value = selected_rows->mutable_value(); value->mutable_data(kDims, place_list_[input_scope_idx]); selected_rows->set_height(height); @@ -256,7 +256,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto& selected_rows = var->Get(); + auto& selected_rows = var->Get(); auto rt = selected_rows.value(); 
PADDLE_ENFORCE_EQ(selected_rows.height(), height, platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 59614e89c1344e..42b87f3853c58a 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -129,9 +129,10 @@ void EagerDeletionOpHandle::RunImpl() { if (var->IsType()) { garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto *tensor_arr = var->GetMutable(); for (auto &t : *tensor_arr) { diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 74f5deed45557c..430f55793b7360 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -64,14 +64,14 @@ void GatherOpHandle::RunImpl() { platform::errors::NotFound("The variable '%s' is not found in the scope.", in_0_handle->name())); - PADDLE_ENFORCE_EQ(pre_in_var->IsType(), true, + PADDLE_ENFORCE_EQ(pre_in_var->IsType(), true, platform::errors::Unimplemented( "Currently, gather_op only supports SelectedRows.")); // Wait input done, this Wait is asynchronous operation WaitInputVarGenerated(); - auto &pre_in_value = pre_in_var->Get(); + auto &pre_in_value = pre_in_var->Get(); std::vector out_rows; std::vector in_tensors; @@ -85,7 +85,7 @@ void GatherOpHandle::RunImpl() { "The variable '%s' is not found in the scope.", in_handle->name())); VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); - auto &in_sr_value = in_var->Get(); + auto &in_sr_value = in_var->Get(); auto &in_sr_rows = in_sr_value.rows(); 
out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); @@ -108,7 +108,7 @@ void GatherOpHandle::RunImpl() { out_var, platform::errors::NotFound("The variable '%s' is not found in the scope.", out_var_handle->name())); - auto out_value = out_var->GetMutable(); + auto out_value = out_var->GetMutable(); out_value->set_height(pre_in_value.height()); out_value->set_rows(out_rows); size_t rows = out_rows.size(); diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 38e20127f1612e..b46168bf8fb314 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -146,7 +146,7 @@ struct TestGatherOpHandle { PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( "The variable '%s' is not found in the scope.", "input")); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -162,10 +162,10 @@ struct TestGatherOpHandle { PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( "The variable '%s' is not found in the scope.", "out")); - auto out_selected_rows = out_var->GetMutable(); + auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); @@ -177,7 +177,7 @@ struct TestGatherOpHandle { p::CPUPlace cpu_place; - auto& out_select_rows = out_var->Get(); + auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); PADDLE_ENFORCE_EQ(out_select_rows.height(), height, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 
db3eaece3569f1..f57136e1f0ed94 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -321,8 +321,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, const Tensor* tensor{nullptr}; if (var->IsType()) { tensor = &var->Get(); - } else if (var->IsType()) { - tensor = &var->Get().value(); + } else if (var->IsType()) { + tensor = &var->Get().value(); } else { VLOG(10) << var_name << " var_name need not to check"; return; @@ -468,8 +468,8 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name, const Tensor* tensor{nullptr}; if (var->IsType()) { tensor = &var->Get(); - } else if (var->IsType()) { - tensor = &var->Get().value(); + } else if (var->IsType()) { + tensor = &var->Get().value(); } else { VLOG(10) << var_name << " var_name need not to check"; return; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 583c34494bca4c..6d136055da7824 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -20,6 +20,11 @@ #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" + +namespace pten { +class SelectedRows; +} // namespace pten + namespace paddle { namespace framework { namespace details { @@ -96,10 +101,10 @@ struct ReduceBufferData { struct GatherLocalSelectedRowsFunctor { GatherLocalSelectedRowsFunctor( - const std::vector &src_selected_rows, + const std::vector &src_selected_rows, const std::vector &in_places, const std::map &dev_ctxes, - const platform::Place &out_place, SelectedRows *dst_selected_rows) + const platform::Place &out_place, pten::SelectedRows *dst_selected_rows) : dev_ctxes_(dev_ctxes), in_places_(in_places), out_place_(out_place), @@ -147,7 +152,7 @@ struct GatherLocalSelectedRowsFunctor { std::vector 
in_tensors_; platform::Place out_place_; - SelectedRows *dst_selected_rows_; + pten::SelectedRows *dst_selected_rows_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6493ef540ccbe0..5cf84a04958b82 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -114,10 +114,10 @@ void ReduceOpHandle::RunImpl() { t_out_p = platform::CPUPlace(); } - if (pre_in_var->IsType()) { + if (pre_in_var->IsType()) { this->RunAndRecordEvent([&] { - std::vector in_selected_rows = - GetInputValues(in_var_handles, var_scopes); + std::vector in_selected_rows = + GetInputValues(in_var_handles, var_scopes); const CollectiveContext &collective_context = *CollectiveContext::GetInstance(); @@ -130,7 +130,7 @@ void ReduceOpHandle::RunImpl() { platform::is_cpu_place(t_out_p)) { GatherLocalSelectedRowsFunctor functor( in_selected_rows, in_places, dev_ctxes_, t_out_p, - out_var->GetMutable()); + out_var->GetMutable()); WaitInputVarGenerated(); functor(); return; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index e9c913b0c82550..5b1267d0970831 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -27,7 +27,6 @@ namespace paddle { namespace framework { -class SelectedRows; namespace details { struct VarHandle; @@ -131,11 +130,11 @@ struct ReduceOpHandle : public OpHandleBase { defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( - const std::vector &src_selecte_rows_, + const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, VarHandle *out_var_handle, const platform::Place &out_place, - SelectedRows *dst_selecte_rows); + pten::SelectedRows *dst_selecte_rows); #endif void Wait( diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc 
b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 35dba488454725..4931c64fdf83f7 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -174,7 +174,7 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( "Variable %s is not found in scope.", "input")); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -190,10 +190,10 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( "Variable %s is not found in scope.", "out")); - auto out_selected_rows = out_var->GetMutable(); + auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); @@ -205,7 +205,7 @@ struct TestReduceOpHandle { p::CPUPlace cpu_place; - auto &out_select_rows = out_var->Get(); + auto &out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); PADDLE_ENFORCE_EQ(out_select_rows.height(), height, diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 7354824aae5996..2efe1c9555857f 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -33,9 +33,9 @@ static void GetTensors(Variable *var, std::unordered_set *tensor_set) { if (var->IsType() && var->Get().IsInitialized()) { tensor_set->insert(var->GetMutable()); - } else if (var->IsType() && - var->Get().value().IsInitialized()) { - tensor_set->insert(var->GetMutable()->mutable_value()); + } else if (var->IsType() && + var->Get().value().IsInitialized()) 
{ + tensor_set->insert(var->GetMutable()->mutable_value()); } else if (var->IsType()) { auto *tensor_arr = var->GetMutable(); for (auto &t : *tensor_arr) { diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 4315b6b0fc245a..9979d2ee205311 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -33,8 +33,8 @@ template static void VisitVariable(Variable* var, Func* func) { if (var->IsType()) { (*func)(var->GetMutable()); - } else if (var->IsType()) { - (*func)(var->GetMutable()); + } else if (var->IsType()) { + (*func)(var->GetMutable()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", @@ -46,8 +46,8 @@ template static void VisitVariable(const Variable& var, Func* func) { if (var.IsType()) { (*func)(var.Get()); - } else if (var.IsType()) { - (*func)(var.Get()); + } else if (var.IsType()) { + (*func)(var.Get()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", ToTypeName(var.Type()))); @@ -59,7 +59,7 @@ struct TensorVisitor { void operator()(LoDTensor* tensor) { result_ = tensor; } - void operator()(SelectedRows* selected_rows) { + void operator()(pten::SelectedRows* selected_rows) { result_ = selected_rows->mutable_value(); } @@ -85,8 +85,8 @@ struct ShareDimsAndLoDVisitor { tensor->Resize(val.dims()); } - void operator()(const SelectedRows& val) { - auto* selected_rows = trg_->GetMutable(); + void operator()(const pten::SelectedRows& val) { + auto* selected_rows = trg_->GetMutable(); selected_rows->set_rows(val.rows()); selected_rows->set_height(val.height()); selected_rows->mutable_value()->Resize(val.value().dims()); @@ -131,8 +131,8 @@ struct EnforceShapeAndDTypeEQVisitor { "The layout of the two variables' tensors tensor is not equal.")); } - void operator()(const SelectedRows& src) { - auto& selected_rows = dst_->Get(); 
+ void operator()(const pten::SelectedRows& src) { + auto& selected_rows = dst_->Get(); PADDLE_ENFORCE_EQ( src.place().GetType(), selected_rows.place().GetType(), platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 83d5a2efa342e5..bea23469f113a9 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -815,8 +815,8 @@ void DownpourWorker::TrainFiles() { if (var->IsType()) { tensor = var->GetMutable(); len = tensor->numel(); - } else if (var->IsType()) { - auto selected_rows = var->GetMutable(); + } else if (var->IsType()) { + auto selected_rows = var->GetMutable(); tensor = selected_rows->mutable_value(); len = tensor->numel(); } diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6e5578a2d12b4c..00d2149cb184b3 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -147,9 +147,10 @@ void DeleteUnusedTensors(const Scope &scope, VLOG(2) << "Erase variable " << var_name; if (var->IsType()) { garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto *lod_tensor_arr = var->GetMutable(); for (auto &t : *lod_tensor_arr) { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 509b43431b5725..05f6ddda292bb4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #endif -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" #include "thrust/pair.h" // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" @@ -81,7 +81,7 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; private: TableContainer* container_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index dec73574685585..72e628223e3178 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -121,7 +121,7 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); - rwlock_.reset(new RWLock); + rwlock_.reset(new pten::RWLock); } template diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 45087036b5d17d..de1a66057afa34 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -136,7 +136,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, if (!root_var) { continue; } - if (root_var->IsType()) { + if (root_var->IsType()) { continue; } LoDTensor* root_tensor = root_var->GetMutable(); diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 9230c36a0c7450..3fe9e877658dad 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -259,7 +259,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, auto var = var_name_item.second[i]; auto& var_name = new_ins[var_name_item.first].at(i); 
const Tensor* tensor_in; - if (var->IsType() || var->IsType()) { + if (var->IsType() || var->IsType()) { tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); } else if (var->IsType()) { tensor_in = diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index aea9ad20353966..f71a5b2c710cea 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -676,8 +676,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { operators::reader:: OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // do nothing - } else if (var->IsType()) { - TensorRecordStream(*(var->GetMutable()->mutable_value())); + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_value())); } else if (var->IsType()) { auto* tensor_arr = var->GetMutable(); for (auto& tensor : *tensor_arr) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc index 7beefec4487de3..ba81ee9166fd65 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc @@ -76,10 +76,12 @@ void InterpreterCoreEventGarbageCollector::Add( } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? 
- } else if (var->IsType()) { - Add(var->GetMutable()->mutable_value()->MoveMemoryHolder(), + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder(), event, ctx); - var->GetMutable()->mutable_rows()->clear(); + var->GetMutable()->mutable_rows()->clear(); } else if (var->IsType()) { auto* tensor_arr = var->GetMutable(); for (auto& t : *tensor_arr) { @@ -132,4 +134,4 @@ void InterpreterCoreEventGarbageCollector::Free( } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc index 784cfca943ea1d..14fb8a9819b2dc 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc @@ -32,9 +32,11 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? 
- } else if (var->IsType()) { - Add(var->GetMutable()->mutable_value()->MoveMemoryHolder()); - var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + var->GetMutable()->mutable_rows()->clear(); } else if (var->IsType()) { auto* tensor_arr = var->GetMutable(); for (auto& t : *tensor_arr) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 214a1d728266b0..0371b12d009f3f 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -468,8 +468,8 @@ void build_op_func_list(const platform::Place& place, if (var->IsType()) { garbages->emplace_back( var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages->emplace_back(var->GetMutable() + } else if (var->IsType()) { + garbages->emplace_back(var->GetMutable() ->mutable_value() ->MoveMemoryHolder()); } else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index fb29e18887b4ee..6c5e98489ef5a8 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -18,7 +18,7 @@ #include #include "paddle/fluid/framework/new_executor/new_executor_defs.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and @@ -171,9 +171,9 @@ void InterpretercoreInferShapeContext::ShareDim(const std::string& in, platform::errors::InvalidArgument( "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType()) { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = 
out_var->GetMutable(); + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -392,8 +392,8 @@ DDim InterpretercoreInferShapeContext::GetDim(Variable* var) const { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -420,8 +420,8 @@ std::vector InterpretercoreInferShapeContext::GetRepeatedDims( void InterpretercoreInferShapeContext::SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or SelectedRows, but received " diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 0ef85a25a237b5..b61b8af1e4a1b3 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -19,10 +19,10 @@ #include #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads 
concurrently, and diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1e0c42309d150b..ae61b7388d1b07 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -78,11 +78,11 @@ static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, if (var->IsType()) { const LoDTensor& tensor = var->Get(); return tensor.dims(); - } else if (var->IsType()) { + } else if (var->IsType()) { if (get_actual_dim) { - return var->Get().value().dims(); + return var->Get().value().dims(); } else { - return var->Get().GetCompleteDims(); + return var->Get().GetCompleteDims(); } } else if (var->IsType()) { return DDim({static_cast(var->Get().size())}); @@ -109,8 +109,8 @@ static std::string GetDtype(const ScopeBase& scope, const std::string& name) { return ""; } return DataTypeToString(tensor.type()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -140,8 +140,8 @@ static std::string GetPlace(const ScopeBase& scope, const std::string& name) { return ""; } return to_string(tensor.place()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -158,8 +158,8 @@ static int GetRowSize(const ScopeBase& scope, const std::string& name) { return -1; } - if (var->IsType()) { - return var->Get().rows().size(); + if (var->IsType()) { + return var->Get().rows().size(); } return -1; @@ -498,8 +498,8 @@ void OperatorBase::GenerateTemporaryNames() { const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { if (var.IsType()) { return static_cast(&(var.Get())); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { 
PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -510,8 +510,8 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { if (var->IsType()) { return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -742,9 +742,9 @@ class RuntimeInferShapeContext : public InferShapeContext { "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType()) { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -951,8 +951,8 @@ class RuntimeInferShapeContext : public InferShapeContext { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -977,8 +977,8 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or 
SelectedRows, but received " @@ -1654,8 +1654,8 @@ void OperatorWithKernel::ParseInputDataType( t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &(var->Get().value()); + } else if (var->IsType()) { + t = &(var->Get().value()); } else if (var->IsType()) { auto t_arr = &var->Get(); for (size_t j = 0; j < t_arr->size(); j++) { @@ -1736,8 +1736,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t = var->GetMutable(); } else if (var->IsType()) { t = var->GetMutable(); - } else if (var->IsType()) { - t = var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + t = var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input variable type in complex type promotion.")); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0b71ee40a135c4..c280eeaa0fa571 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -118,7 +118,7 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) { } inline bool VarIsTensor(const Variable& var) { - return var.IsType() || var.IsType(); + return var.IsType() || var.IsType(); } const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); @@ -473,7 +473,7 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { } bool IsSelectedRowsInput(const std::string& name) const override { - return ctx_.InputVar(name)->IsType(); + return ctx_.InputVar(name)->IsType(); } private: diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index df7e3c4f6dde3b..ef6c41990cd6e2 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -456,7 +456,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto* var = scope.Var("selected_rows_1"); - var->GetMutable(); + var->GetMutable(); bool 
caught = false; try { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 54167d95899d6f..535c9ab58e295f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -38,12 +38,12 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -75,7 +75,7 @@ const CinnCompiledObject& CinnCompiler::Compile( bool exist = false; { - AutoRDLock r_guard{&rwlock_}; + pten::AutoRDLock r_guard{&rwlock_}; exist = cache_by_address_.count(cur_key_by_address) != 0; // if cannot find graph by address, checkout whether the graph structure // have been stored in cache. 
@@ -96,13 +96,13 @@ const CinnCompiledObject& CinnCompiler::Compile( std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); - AutoWRLock w_guard{&rwlock_}; + pten::AutoWRLock w_guard{&rwlock_}; if (!cache_by_struct_.count(cur_key_by_struct)) { cache_by_address_[cur_key_by_address] = compiled_res.get(); cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } - AutoRDLock guard{&rwlock_}; + pten::AutoRDLock guard{&rwlock_}; const auto& cached_boj = *cache_by_address_[cur_key_by_address]; return cached_boj; } @@ -198,7 +198,7 @@ std::string CinnCompiler::ReadableKey( void CinnCompiler::Clear() { { - AutoWRLock guard{&rwlock_}; + pten::AutoWRLock guard{&rwlock_}; graphs_.clear(); cache_by_address_.clear(); cache_by_struct_.clear(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5070eb5ce5674d..024dd26747b8e7 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -26,9 +26,9 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { @@ -102,7 +102,7 @@ class CinnCompiler { std::unique_ptr, CinnCacheKey::Hash> cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; - mutable RWLock rwlock_; + mutable pten::RWLock rwlock_; DISABLE_COPY_AND_ASSIGN(CinnCompiler); }; diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index 7762476c60fb98..9b1019f6582377 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -68,5 +68,12 @@ struct ConvertToPtenContext { using TYPE = 
pten::CPUContext; }; +#ifdef PADDLE_WITH_XPU +template <> +struct ConvertToPtenContext { + using TYPE = pten::XPUContext; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h deleted file mode 100644 index 9b74a55304077c..00000000000000 --- a/paddle/fluid/framework/rw_lock.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct RWLock { - RWLock() { pthread_rwlock_init(&lock_, nullptr); } - - ~RWLock() { pthread_rwlock_destroy(&lock_); } - - inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, - platform::errors::External("The pthread failed to acquire read lock.")); - } - - inline void WRLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, - platform::errors::External( - "The pthread failed to acquire write lock.")); - } - - inline void UNLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_unlock(&lock_), 0, - platform::errors::External("The pthread failed to unlock.")); - } - - private: - pthread_rwlock_t lock_; -}; -// TODO(paddle-dev): Support RWLock for WIN32 for correctness. 
-#else -// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive -// In windows, rw_lock seems like a hack. Use empty object and do nothing. -struct RWLock { - // FIXME(minqiyang): use mutex here to do fake lock - inline void RDLock() { mutex_.lock(); } - - inline void WRLock() { mutex_.lock(); } - - inline void UNLock() { mutex_.unlock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoWRLock { - public: - explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoWRLock() { UnLock(); } - - private: - inline void Lock() { lock_->WRLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -class AutoRDLock { - public: - explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoRDLock() { UnLock(); } - - private: - inline void Lock() { lock_->RDLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc deleted file mode 100644 index d140e95a37d84f..00000000000000 --- a/paddle/fluid/framework/rw_lock_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/rw_lock.h" - -#include -#include // NOLINT - -namespace f = paddle::framework; - -void f1(f::RWLock *lock) { - lock->RDLock(); - lock->UNLock(); -} - -TEST(RWLOCK, read_read) { - f::RWLock lock; - lock.RDLock(); - std::thread t1(f1, &lock); - std::thread t2(f1, &lock); - t1.join(); - t2.join(); - lock.UNLock(); -} - -void f2(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 0UL); - lock->UNLock(); -} - -void f3(f::RWLock *lock, std::vector *result) { - lock->WRLock(); - result->push_back(1); - lock->UNLock(); -} - -TEST(RWLOCK, read_write) { - f::RWLock lock; - std::vector result; - - lock.RDLock(); - std::thread t1(f2, &lock, &result); - t1.join(); - std::thread t2(f3, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - ASSERT_EQ(result.size(), 0UL); - lock.UNLock(); - t2.join(); - ASSERT_EQ(result.size(), 1UL); -} - -void f4(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 1UL); - lock->UNLock(); -} - -TEST(RWLOCK, write_read) { - f::RWLock lock; - std::vector result; - - lock.WRLock(); - std::thread t1(f4, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - result.push_back(1); - lock.UNLock(); - t1.join(); -} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b2062cc51206a9..e6a372a8e631f9 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,10 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK pten::AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK pten::AutoWRLock 
auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK pten::AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK pten::AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b963c28d597bbb..7eb6082ce15fea 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -26,9 +26,9 @@ extern "C" { #include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -194,8 +194,8 @@ class Scope : public ScopeBase { #ifndef PADDLE_ON_INFERENCE private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable pten::RWLock kids_lock_; + mutable pten::RWLock vars_lock_; #endif }; diff --git a/paddle/fluid/framework/selected_rows_utils.cc b/paddle/fluid/framework/selected_rows_utils.cc index c33ee655c2a98b..a1bffcfce19f1a 100644 --- a/paddle/fluid/framework/selected_rows_utils.cc +++ b/paddle/fluid/framework/selected_rows_utils.cc @@ -17,73 +17,8 @@ limitations under the License. */ namespace paddle { namespace framework { -struct ReAllocateVisitor { - ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor) - : dims_(dims), tensor_(tensor) {} - - template - void operator()() const { - framework::Tensor cpu_tensor; - platform::CPUPlace cpu; - T* ptr = cpu_tensor.mutable_data(dims_, cpu); - const T* old_ptr = - tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); - } - tensor_->ShareDataWith(cpu_tensor); - } - - framework::DDim dims_; - framework::Tensor* tensor_; -}; - -struct TensorCopyVisitor { - TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, - const framework::Tensor src, int64_t src_offset, - int64_t size) - : dst_(dst), - dst_offset_(dst_offset), - src_(src), - src_offset_(src_offset), - size_(size) {} - - template - void apply() const { - // TODO(Yancey1989): support other place - platform::CPUPlace cpu; - memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, - src_.data() + src_offset_, size_ * sizeof(T)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - framework::Tensor src_; - int64_t src_offset_; - int64_t size_; -}; - -struct TensorFillVisitor { - TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, - float value) - : dst_(dst), dst_offset_(dst_offset), size_(size) {} - - template - void apply() const { - // TODO(qiao): support other place - platform::CPUPlace cpu; - auto* tensor_data = dst_->mutable_data(cpu); - auto* start = tensor_data + dst_offset_; - auto* end = start + size_; - std::fill(start, end, static_cast(0.0)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - int64_t size_; -}; - -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version constexpr uint32_t version = 0; @@ -107,7 +42,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows) { platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; auto place = selected_rows.place(); @@ -115,14 +51,15 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, + pten::SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); DeserializeFromStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, unit32_t version for SelectedRows @@ -151,109 +88,5 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, // the 4st field, tensor which contains the data TensorFromStream(is, selected_rows->mutable_value(), dev_ctx); } - -bool SelectedRows::HasKey(int64_t key) const { - return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false - : true; -} - -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, - bool is_test) { - if (is_test) { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - rwlock_->RDLock(); - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - rwlock_->UNLock(); - PADDLE_ENFORCE_EQ( - auto_grown, true, - platform::errors::NotFound("Input key(%lld) is not found.", key)); - rwlock_->WRLock(); - auto map_size = id_to_index_.size(); - auto vector_size = rows_.size(); - if (map_size != vector_size) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Row map size(%zu) should be equal to rows size(%zu).", map_size, - vector_size)); - } - auto write_iter = id_to_index_.find(key); - if (write_iter == id_to_index_.end()) { - int row_num = rows_.size(); - if (row_num == value_->dims()[0]) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Selected rows is full, then length exceed the length of first " - "dimension (%d).", - row_num)); - } - // key logic to put a key into id_to_index_ - rows_.push_back(key); - auto index = static_cast(rows_.size() - 1); - id_to_index_[key] = index; - rwlock_->UNLock(); - return index; - } else { - auto index = write_iter->second; - rwlock_->UNLock(); - return index; - } - } else { - auto index = iter->second; - rwlock_->UNLock(); - return index; - } -} - -void SelectedRows::SyncIndex() { - rwlock_->WRLock(); - id_to_index_.clear(); - for (size_t i = 0; i < rows_.size(); ++i) { - id_to_index_[rows_[i]] = i; - } - rwlock_->UNLock(); -} - -void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown, bool is_test) { - PADDLE_ENFORCE_EQ(value->IsInitialized(), true, - platform::errors::InvalidArgument( - "The value tensor is not initialized.")); - if (ids.numel() == 0) { - VLOG(3) << "keys is empty, please check data!"; - } else { - int64_t value_width = 
value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ( - value_width, value->numel() / value->dims()[0], - platform::errors::InvalidArgument( - "Output tensor should have the same shape with table " - "except the first dimmension, excepted value width not counting " - "the first dimension is %d, actual value width is %d.", - value_width, value->numel() / value->dims()[0])); - for (int i = 0; i < ids.numel(); ++i) { - auto id = ids.data()[i]; - int64_t index = AutoGrownIndex(id, auto_grown, is_test); - if (index < 0) { - VLOG(5) << "id " << id << " not in the table, return 0"; - framework::VisitDataType( - value_->type(), - TensorFillVisitor(value, i * value_width, value_width, 0.0)); - } else { - framework::VisitDataType( - value_->type(), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); - } - } - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h index 445f446ef2f4ae..e1b26f2bbafa3f 100644 --- a/paddle/fluid/framework/selected_rows_utils.h +++ b/paddle/fluid/framework/selected_rows_utils.h @@ -21,153 +21,28 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/selected_rows.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { - -class SelectedRows { - /* - * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t`, - * and the value is a Tensor which the first dimension is 0. 
- * You can use the following interface to operate the sparse table, and you - * can find - * some detail information from the comments of each interface: - * - * HasKey(key), whether the sparse table has the specified key. - * Set(key, value), set a key-value pair into the sparse table. - * Get(keys, value*), get value by given key list and apply it to the given - * value pointer - * with the specified offset. - * - */ - public: - SelectedRows(const std::vector& rows, const int64_t& height) - : rows_(rows), height_(height) { - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - SelectedRows() { - height_ = 0; - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - const platform::Place& place() const { return value_->place(); } - - const Tensor& value() const { return *value_; } - - Tensor* mutable_value() { return value_.get(); } - - int64_t height() const { return height_; } - - void set_height(int64_t height) { height_ = height; } - - const Vector& rows() const { return rows_; } - - Vector* mutable_rows() { return &rows_; } - - void set_rows(const Vector& rows) { rows_ = rows; } - - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - PADDLE_THROW(platform::errors::NotFound( - "Input id (%lld) is not in current rows table.", key)); - } - return static_cast(std::distance(rows_.begin(), it)); - } - - /* - * @brief whether has the specified key in the table. - * - * @return true if the key is exists. - */ - bool HasKey(int64_t key) const; - - /* - * @brief Get value by the key list. - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. 
- * - * @return a list of pair which contains the non-exists key and the index in - * the value - */ - void Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown = false, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. If the key not - * exist, - * add the key into id_to_index_. - * - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. - * - * @return index of the key. - */ - int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. - */ - inline int64_t GetIndexFromId(int64_t key) const { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - void SyncIndex(); - /* - * @brief Get complete Dims before - */ - DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); - dims[0] = height_; - return make_ddim(dims); - } - - private: - // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simply concated when adding together. Until a - // SelectedRows add a Tensor, will the duplicate rows be handled. - Vector rows_; - std::unordered_map - id_to_index_; // should not be used when rows_ has duplicate member - std::unique_ptr value_{nullptr}; - int64_t height_; // height indicates the underline tensor's height - std::unique_ptr rwlock_{nullptr}; -}; - /* * Serialize/Desiralize SelectedRows to std::ostream * You can pass ofstream or ostringstream to serilize to file * or to a in memory string. GPU tensor will be copied to CPU. 
*/ -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx); -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows); -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, pten::SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index 7a9f86041d996e..9a14f4395d9a19 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -24,7 +24,7 @@ class SelectedRowsTester : public ::testing::Test { std::vector rows{0, 4, 7}; int64_t height = 10; int64_t row_numel = 100; - selected_rows_.reset(new SelectedRows(rows, height)); + selected_rows_.reset(new pten::SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( @@ -36,7 +36,7 @@ class SelectedRowsTester : public ::testing::Test { protected: platform::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } @@ -50,7 +50,7 @@ TEST_F(SelectedRowsTester, complete_dims) { } TEST_F(SelectedRowsTester, SerializeAndDeseralize) { - SelectedRows dst_tensor; + pten::SelectedRows dst_tensor; platform::CPUDeviceContext cpu_ctx(place_); std::ostringstream oss; @@ -71,7 +71,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { 
TEST(SelectedRows, SparseTable) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100; int64_t embedding_width = 8; @@ -124,7 +124,7 @@ TEST(SelectedRows, SparseTable) { } } -void f1(SelectedRows* table, int table_size) { +void f1(pten::SelectedRows* table, int table_size) { for (int i = 1000000; i > 0; --i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -135,7 +135,7 @@ void f1(SelectedRows* table, int table_size) { } } -void f2(SelectedRows* table, int table_size) { +void f2(pten::SelectedRows* table, int table_size) { for (int i = 0; i < 1000000; ++i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -146,7 +146,7 @@ void f2(SelectedRows* table, int table_size) { } } -void f3(SelectedRows* table, int table_size) { +void f3(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 100000; i > 0; --i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -157,7 +157,7 @@ void f3(SelectedRows* table, int table_size) { std::cout << "f3 run time:" << t2 - t1 << std::endl; } -void f4(SelectedRows* table, int table_size) { +void f4(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 0; i < 100000; ++i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -170,7 +170,7 @@ void f4(SelectedRows* table, int table_size) { TEST(SelectedRows, MultiThreadAutoIndex) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100000; int64_t embedding_width = 8; diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 5747df57c45685..dd1e329ac03231 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -57,7 +57,7 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; case proto::VarType::SELECTED_ROWS: - visitor(var.Get()); + visitor(var.Get()); 
return; case proto::VarType::READER: visitor(var.Get()); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 008b6829f9fe37..ac55abaad8d0a7 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -49,6 +49,7 @@ namespace pten { class DenseTensor; +class SelectedRows; } // namespace pten // Users should add forward declarations here @@ -76,7 +77,6 @@ class LoDRankTable; class ScopeBase; class ReaderHolder; class Scope; -class SelectedRows; } // namespace framework namespace operators { @@ -166,7 +166,7 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. using VarTypeRegistry = detail::VarTypeRegistryImpl< - Tensor, SelectedRows, std::vector, LoDRankTable, Strings, + Tensor, pten::SelectedRows, std::vector, LoDRankTable, Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, @@ -206,7 +206,7 @@ struct VarTypeTrait { // Users should set some of variable type ids to be what is defined in // framework.proto below REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); -REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(pten::SelectedRows, proto::VarType::SELECTED_ROWS); REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 812a34112a465a..bc418363bf737d 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -92,7 +92,7 @@ bool 
CheckVarId(int proto_id) { TEST(var_type_traits, check_proto_type_id) { ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); - ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 188b00d818de3d..182ddafe3ea3da 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -123,8 +123,8 @@ inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); - } else if (IsType()) { - version_counter_ptr = &GetMutable() + } else if (IsType()) { + version_counter_ptr = &GetMutable() ->mutable_value() ->InplaceVersionCounter(); } else { diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 34ab07def54c18..3c71987303bd40 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -31,7 +31,7 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == proto::VarType::FETCH_LIST) { @@ -70,9 +70,9 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = 
dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index d1d6a0f5adf581..0f105ec9a30823 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -39,8 +39,8 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { if (src.IsType()) { return src.Get().place(); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { - return src.Get().value().place(); + } else if (src.IsType()) { + return src.Get().value().place(); #endif } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -70,8 +70,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, } #if NCCL_VERSION_CODE >= 2212 -static void AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst, +static void AllReduce(const pten::SelectedRows &src, pten::SelectedRows *dst, const ParallelStrategy &strategy, const gpuStream_t stream, const platform::NCCLComm *comm) { @@ -191,19 +190,18 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), dst->GetMutable(), stream, comm); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable(), strategy, stream, - comm); + AllReduce(src.Get(), + dst->GetMutable(), strategy, stream, comm); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable(), strategy, stream, + AllReduce(src.Get(), + tmp_dst.GetMutable(), strategy, stream, comm); // stream must synchronize to ensure accuracy of the move operation platform::GpuStreamSync(stream); diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h 
b/paddle/fluid/imperative/dygraph_grad_maker.h index c50018f8236037..e1931a3b0f2489 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -365,12 +365,12 @@ class TracedGradOp { var_wrapper->MutableVar()->CurrentInplaceVersion()) { return var_wrapper; } else if (var_wrapper->MutableVar()->IsType() || - var_wrapper->MutableVar()->IsType()) { + var_wrapper->MutableVar()->IsType()) { auto* tensor = var_wrapper->MutableVar()->IsType() ? var_wrapper->MutableVar()->GetMutable() : var_wrapper->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (!tensor->IsInitialized()) { return var_wrapper; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 1eaf0c6538043f..44315e267ee78d 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -72,18 +72,18 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, } AllReduce(src.Get(), dst->GetMutable()); - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable()); + AllReduce(src.Get(), + dst->GetMutable()); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable()); + AllReduce(src.Get(), + tmp_dst.GetMutable()); *dst = std::move(tmp_dst); } } else { @@ -120,8 +120,8 @@ void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, break; \ } -void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst) { +void GLOOParallelContext::AllReduce(const pten::SelectedRows &src, + pten::SelectedRows *dst) { // auto ; // int local_rank = strategy_.local_rank_; int nranks = strategy_.nranks_; diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 
f13bb859eee936..d63d48eac7e02b 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -59,8 +59,7 @@ class GLOOParallelContext : public ParallelContext { private: void AllReduce(const framework::Tensor& src, framework::Tensor* dst); - void AllReduce(const framework::SelectedRows& src, - framework::SelectedRows* dst); + void AllReduce(const pten::SelectedRows& src, pten::SelectedRows* dst); private: std::unique_ptr device_; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 092872247cca56..e075e11a08d3b7 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -55,12 +55,12 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, auto* dst_tensor = dst->GetMutable(); framework::TensorCopy(src_tensor, src_tensor.place(), dst_tensor); dst_tensor->set_lod(src_tensor.lod()); - } else if (src->IsType()) { - auto& src_selected_rows = src->Get(); - if (!dst->IsType()) { + } else if (src->IsType()) { + auto& src_selected_rows = src->Get(); + if (!dst->IsType()) { dst->Clear(); } - auto* dst_selected_rows = dst->GetMutable(); + auto* dst_selected_rows = dst->GetMutable(); framework::TensorCopy(src_selected_rows.value(), src_selected_rows.value().place(), dst_selected_rows->mutable_value()); @@ -332,7 +332,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { void SelectedRowsAddToTensor(const framework::Variable& src, framework::Variable* dst) { auto* dst_tensor = dst->GetMutable(); - auto& src_selected_rows = src.Get(); + auto& src_selected_rows = src.Get(); auto place = dst_tensor->place(); auto data_type = src_selected_rows.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -371,7 +371,7 @@ static void SelectedRowsAddTensor( const framework::Variable& src_tensor_var, framework::Variable* dst_tensor_var) { 
const auto& src_selected_rows = - src_selected_rows_var.Get(); + src_selected_rows_var.Get(); const auto& src_tensor = src_tensor_var.Get(); const auto& place = src_tensor.place(); auto data_type = src_tensor.type(); @@ -414,18 +414,18 @@ static void SelectedRowsAddTensor( // to one then add it to a empty selected rows, the after is correct std::shared_ptr SelectedRowsMerge( const framework::Variable& src1, const framework::Variable& src2) { - auto& src_selected_rows1 = src1.Get(); - auto& src_selected_rows2 = src2.Get(); + auto& src_selected_rows1 = src1.Get(); + auto& src_selected_rows2 = src2.Get(); auto place = src_selected_rows1.value().place(); auto data_type = src_selected_rows1.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - std::vector src_selected_rows; + std::vector src_selected_rows; src_selected_rows.emplace_back(&src_selected_rows1); src_selected_rows.emplace_back(&src_selected_rows2); auto dst_var = std::make_shared("Temp"); auto* dst_selected_rows = - dst_var->MutableVar()->GetMutable(); + dst_var->MutableVar()->GetMutable(); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -463,7 +463,7 @@ void VariableWrapperAdd(std::shared_ptr var, if (dst->IsType()) { if (src.IsType()) { TensorAdd(src, dst); - } else if (src.IsType()) { + } else if (src.IsType()) { SelectedRowsAddToTensor(src, dst); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -481,7 +481,7 @@ void VariableWrapperAdd(std::shared_ptr var, SelectedRowsAddToTensor(*dst, src_mutable); *dst = std::move(*(var->MutableVar())); } - } else if (src.IsType()) { + } else if (src.IsType()) { auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { @@ -497,8 +497,8 @@ static platform::Place GetPlaceOfVar( platform::Place place; if (var->Var().IsType()) { place = var->Var().Get().place(); - } else if (var->Var().IsType()) { - place = 
var->Var().Get().place(); + } else if (var->Var().IsType()) { + place = var->Var().Get().place(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "only support LoDTensor and SelectedRows in dygraph")); @@ -530,14 +530,14 @@ void GradientAccumulator::AccumulateGrad() { if (dst->IsType()) { if (src->IsType()) { TensorAdd(*src, dst); - } else if (src->IsType()) { + } else if (src->IsType()) { SelectedRowsAddToTensor(*src, dst); } - } else if (dst->IsType()) { + } else if (dst->IsType()) { if (src->IsType()) { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); - } else if (src->IsType()) { + } else if (src->IsType()) { auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } @@ -657,7 +657,7 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, // so synchronous VariableWrapper with Variable. if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } @@ -701,7 +701,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { - if (!var_info.var->Var().IsType()) { + if (!var_info.var->Var().IsType()) { continue; } @@ -744,7 +744,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } PADDLE_ENFORCE_EQ( var_info.var->Var().IsType() || - var_info.var->Var().IsType(), + var_info.var->Var().IsType(), true, platform::errors::PermissionDenied("The type of Gradient " "var must be LoDTensor " "or SelectedRows")); @@ -789,7 +789,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } } diff 
--git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6411dce4405c11..8896e5d0f40644 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -31,7 +31,7 @@ class GradientAccumulator { if (var && var->Var().IsInitialized()) { if (var->Var().IsType()) { var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (var->Var().IsType()) { + } else if (var->Var().IsType()) { var->SetType(framework::proto::VarType::SELECTED_ROWS); } else { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 71f7fb7387effe..a39e58bba90110 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -196,8 +196,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { auto* out_lod_tensor = out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -365,8 +365,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -382,8 +382,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { void SetDim(framework::Variable* var, const DDim& dim) { if (var->IsType()) { 
var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index f47b024973ba78..65720c8a3cf657 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -105,9 +105,9 @@ static std::string DebugString( ss << "NOT_INITED"; } ss << ">"; - } else if (var.IsType()) { + } else if (var.IsType()) { ss << "SelectedRows<"; - auto& selected_rows = var.Get(); + auto& selected_rows = var.Get(); auto& tensor = selected_rows.value(); auto& rows = selected_rows.rows(); if (tensor.IsInitialized()) { @@ -188,9 +188,8 @@ size_t VarBase::GradOpNum() const { void VarBase::ClearGradient(bool set_to_zero) { VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { - if (grad_var_->Var().IsType()) { - auto* grad_t = - grad_var_->MutableVar()->GetMutable(); + if (grad_var_->Var().IsType()) { + auto* grad_t = grad_var_->MutableVar()->GetMutable(); if (grad_t->mutable_value()->IsInitialized()) { #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); @@ -248,7 +247,7 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, const bool blocking) const { PADDLE_ENFORCE_EQ( Var().IsInitialized() && (Var().IsType() || - Var().IsType()), + Var().IsType()), true, platform::errors::InvalidArgument( "Variable is not initialized or Variable's type is not " "LoDTensor or SelectedRows when getting numpy tensor")); @@ -277,12 +276,12 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, << dst_place; return new_var; } else { - auto& src_selected_rows = Var().Get(); + auto& src_selected_rows = Var().Get(); auto new_var = std::make_shared( false, "Itmp" + std::to_string(copied_counter_++)); 
new_var->SetType(framework::proto::VarType::SELECTED_ROWS); auto* dst_selected_rows = - new_var->MutableVar()->GetMutable(); + new_var->MutableVar()->GetMutable(); framework::TensorCopy(src_selected_rows.value(), dst_place, dst_selected_rows->mutable_value()); @@ -346,10 +345,9 @@ void VarBase::CopyFrom(const VarBase& src, const bool blocking) { dst_tensor->Resize(src_tensor.dims()); } framework::TensorCopy(src_tensor, place, dst_tensor); - } else if (src.Var().IsType()) { - auto& src_selected_rows = src.Var().Get(); - auto* dst_selected_rows = - MutableVar()->GetMutable(); + } else if (src.Var().IsType()) { + auto& src_selected_rows = src.Var().Get(); + auto* dst_selected_rows = MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 5f144c08d21e3b..2caa5319094fe9 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -47,8 +47,8 @@ const std::shared_ptr& GetVariableWrapper( const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index ad518eb96062d2..54e27b2bd8c313 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -36,8 +36,7 @@ namespace imperative { void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = is_sparse_ - ? sparse_contents_->GetMutable() - ->mutable_value() + ? 
sparse_contents_->GetMutable()->mutable_value() : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { @@ -775,7 +774,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( - var_base->Var().IsType(), true, + var_base->Var().IsType(), true, platform::errors::PreconditionNotMet( "The sparse parameter[%d][%s] must have a selectedrows gradient. " "Before forward pass, the parameter type is inferred to be " @@ -995,8 +994,8 @@ bool Reducer::HasGrad(size_t var_index) { if (var.Get().IsInitialized()) { return true; } - } else if (var.IsType()) { - if (var.Get().value().IsInitialized()) { + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { return true; } } else { diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 0a7df9953ad45a..6cf5585ba83e3d 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -124,8 +124,8 @@ static void CopyVar(const framework::Variable& var, auto* dst_tensor = dst.GetMutable(); framework::TensorCopySync(src_tensor, src_tensor.place(), dst_tensor); } else { - const auto& src_selected_rows = var.Get(); - auto* dst_selected_rows = dst.GetMutable(); + const auto& src_selected_rows = var.Get(); + auto* dst_selected_rows = dst.GetMutable(); dst_selected_rows->set_rows(src_selected_rows.rows()); dst_selected_rows->set_height(src_selected_rows.height()); framework::TensorCopySync(src_selected_rows.value(), @@ -148,8 +148,8 @@ static bool IsEqualVar(const framework::Variable& var1, framework::TensorCopySync(var2.Get(), platform::CPUPlace(), &t2); } else { - auto& s1 = var1.Get(); - auto& s2 = var2.Get(); + auto& s1 = var1.Get(); + auto& s2 = var2.Get(); if (s1.height() != s2.height()) { return false; @@ -166,9 +166,9 @@ static 
bool IsEqualVar(const framework::Variable& var1, return false; } - framework::TensorCopySync(var1.Get().value(), + framework::TensorCopySync(var1.Get().value(), platform::CPUPlace(), &t1); - framework::TensorCopySync(var2.Get().value(), + framework::TensorCopySync(var2.Get().value(), platform::CPUPlace(), &t2); } @@ -211,7 +211,7 @@ static framework::Variable RandomSelectedRows(framework::DDim dims, dims[0] = row_number; framework::Variable ret; - auto* sr = ret.GetMutable(); + auto* sr = ret.GetMutable(); auto tensor_var = RandomTensor(dims, place, low, high); sr->mutable_value()->ShareDataWith( tensor_var.template Get()); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index 064f47f54979a1..c54ed34bb8108a 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -237,7 +237,7 @@ TEST(test_layer, test_debug_string) { std::shared_ptr selected_rows( new imperative::VarBase(false, "selected_rows")); auto tensor_sr = selected_rows->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); std::string res_ui_sr = test_func(selected_rows); ASSERT_TRUE(res_ui_sr.find("NOT_INITED") != std::string::npos); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 5e269d74044d24..b4ff3cff38217a 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -101,7 +101,7 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var); TEST(test_prepare_op, test_get_tensor_from_var) { std::shared_ptr vout_error( new imperative::VarBase(false, "vout_error")); - vout_error->MutableVar()->GetMutable(); + vout_error->MutableVar()->GetMutable(); auto* ts = GetTensorFromVar(*vout_error->MutableVar()); ASSERT_TRUE(ts != nullptr); } diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 
c257191a546e43..bd96cd3f1aa178 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -104,8 +104,8 @@ class VariableWrapper { const framework::Tensor* tensor = nullptr; if (var_.IsType()) { tensor = &(var_.Get()); - } else if (var_.IsType()) { - tensor = &(var_.Get().value()); + } else if (var_.IsType()) { + tensor = &(var_.Get().value()); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only support LoDTensor and SelectedRows for gradient var")); @@ -153,7 +153,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else if (type_ == framework::proto::VarType::VOCAB) { const framework::Vocab* data = nullptr; data = &(var_.Get()); @@ -193,7 +193,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return place; diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index fa5997d92dd231..bd867ba54d2359 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -43,7 +43,7 @@ struct TensorArrayBatchCleaner { constexpr auto kLoDTensorId = framework::VarTypeTrait::kId; constexpr auto kSelectedRowsId = - framework::VarTypeTrait::kId; + framework::VarTypeTrait::kId; constexpr auto kFetchListId = framework::VarTypeTrait::kId; valid_types_.insert(kTensorId); diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 97952e4b71641e..023b40518edf21 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ 
b/paddle/fluid/memory/CMakeLists.txt @@ -34,6 +34,13 @@ if (WITH_ROCM) DEPS device_context malloc) endif() +if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(get_base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() + #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 939ad140415df4..c0d1934a703b66 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,10 +125,3 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) - -if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) - set_tests_properties(base_ptr_test PROPERTIES - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; - FLAGS_use_stream_safe_cuda_allocator=true;") -endif() diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 3f04d475163772..878633d1a62915 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -93,14 +93,7 @@ class Allocation : public pten::Allocation { const platform::Place& place) : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} - void* base_ptr() const { - PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", - paddle::platform::errors::Unimplemented( - "base_ptr() is only implemented for auto_growth " - "strategy, not support %s strategy", - FLAGS_allocator_strategy)); - return base_ptr_; - } + void* base_ptr() const { return 
base_ptr_; } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6615bdf4b138b4..7cdac0de6138f1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -282,6 +282,10 @@ class AllocatorFacadePrivate { return iter->second; } + void* GetBasePtr(const std::shared_ptr& allocation) { + return static_cast(allocation.get())->base_ptr(); + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, const gpuStream_t& stream) { @@ -821,6 +825,21 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +void* AllocatorFacade::GetBasePtr( + const std::shared_ptr& allocation) { + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for auto_growth " + "strategy, not support allocator strategy: %d", + static_cast(GetAllocatorStrategy()))); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for CUDAPlace(), not " + "suppot place: %s", + allocation->place())); + return m_->GetBasePtr(allocation); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 76e2f0b5a94f6d..a9b92e1801e4a3 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -51,6 +51,8 @@ class AllocatorFacade { const std::shared_ptr& GetAllocator(const platform::Place& place); + void* GetBasePtr(const 
std::shared_ptr& allocation); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu similarity index 80% rename from paddle/fluid/memory/allocation/base_ptr_test.cu rename to paddle/fluid/memory/get_base_ptr_test.cu index 5edabfcb9f5e7e..fe1d73b6028496 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -35,9 +35,9 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void OneByOneAllocTest() { for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void BatchByBatchAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(batch_size_); size_t batch_num = alloc_times_ / batch_size_; for (size_t i = 0; i < batch_num; ++i) { for (size_t j = 0; j < batch_size_; ++j) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); } @@ -70,19 +70,19 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ContinuousAllocTest() { - std::vector allocations; + std::vector> allocations; 
allocations.reserve(alloc_times_); for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); @@ -90,8 +90,8 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ZeroSizeAllocTest() { - AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + auto allocation = AllocShared(place_, 0); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 3e859377e98d80..63c562be97fa07 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -47,6 +47,10 @@ bool InSameStream(const std::shared_ptr& allocation, stream); } +void* GetBasePtr(const std::shared_ptr& allocation) { + return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 6443e91f08cbeb..855cbb775a1096 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -44,6 +44,8 @@ extern std::shared_ptr AllocShared(const platform::Place& place, extern bool InSameStream(const std::shared_ptr& allocation, const platform::Stream& stream); +extern void* GetBasePtr(const std::shared_ptr& allocation); + #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 979ae5c508c6b6..5d769214df4d15 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { inverse_scale = 0.0; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); + auto version = dev_ctx.xpu_version(); framework::Tensor float_x; framework::Tensor float_out; if (std::is_same::value && - (version == paddle::platform::XPUVersion::XPU1)) { + (version == pten::backends::xpu::XPUVersion::XPU1)) { float_x.mutable_data(dev_ctx.GetPlace(), x->numel() * sizeof(MPDType)); float_out.mutable_data(dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index 5fe2ebb20745b2..1125bbe93c37a9 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -50,9 +50,8 @@ class AssignFunctor { } } - void operator()(const framework::SelectedRows &rows) const { - framework::SelectedRows &out_rows = - *out_->GetMutable(); + void operator()(const pten::SelectedRows &rows) const { + pten::SelectedRows &out_rows = *out_->GetMutable(); out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index 3504ec37d6670b..efc1ed9e2ee604 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -87,7 +87,7 @@ TEST(AssignOp, AssignSelectedRows) { std::vector rows{0, 4, 7}; int64_t height = 10; - paddle::framework::SelectedRows input(rows, height); + pten::SelectedRows input(rows, height); 
paddle::framework::Tensor* input_tensor = input.mutable_value(); paddle::framework::DDim in_dims = paddle::framework::make_ddim({3, 4}); @@ -98,7 +98,7 @@ TEST(AssignOp, AssignSelectedRows) { assign_functor(input); - auto& out_selected_row = output.Get(); + auto& out_selected_row = output.Get(); const paddle::framework::Vector& out_rows = out_selected_row.rows(); EXPECT_EQ(rows.size(), out_rows.size()); for (size_t i = 0; i < rows.size(); ++i) { diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 368fbe836c266c..4d04fdc8ce2d2c 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -36,21 +36,22 @@ class ClipByNormKernel output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index adb2a2fcfa3a70..fb21e98efec2c7 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +// using SelectedRows = pten::SelectedRows; template using EigenVector = 
framework::EigenVector; @@ -43,20 +43,21 @@ class ClipByNormKernel : public framework::OpKernel { output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index fb41dc16d65129..5aff62656fb0f4 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -113,9 +113,9 @@ class ClipKernel : public framework::OpKernel { trans(context.template device_context(), x_data, x_data + numel, out_data, ClipFunctor(min, max)); } - } else if (x_var->IsType()) { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + } else if (x_var->IsType()) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); PADDLE_ENFORCE_NE(x, out, platform::errors::InvalidArgument( "Inplace clip is not allowed " "when x is SelectedRows")); diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index cded525b030d8d..e80797bd9b971a 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel { return; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version 
== paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm(mask->numel()); float scale = diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 0c7d12ae0ad55c..8923f1fd4b8662 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -32,7 +32,7 @@ class ElementwiseMulKernel ctx.InputName("X"))); const auto& cuda_ctx = ctx.template device_context(); - if (x_var->IsType()) { + if (x_var->IsType()) { framework::Tensor x_for_selectedrows; std::vector ins; std::vector outs; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index e7a5e48b1f1b55..40faf7cbbe8cd8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -92,20 +92,20 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); framework::Tensor x, *z; - if (x_var->IsType()) { + if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " "scalar. 
But reveived the size of Y = %s.", y->dims().size())); - auto& x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); x = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); z->mutable_data(ctx.GetPlace()); auto dims_equal = x.dims() == y->dims(); if (dims_equal) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 5f4212e556f241..fdf04181de76c6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -84,7 +84,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, auto *x = ctx.Input("X"); z = ctx.Output("Out"); ins->emplace_back(x); - } else if (x_var->IsType()) { + } else if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " @@ -96,15 +96,15 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, "The parameter x_for_selectedrows is excepted to " "be valid, once input varible X`s class type is " "SelectedRows.\n")); - auto &x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto &x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); *x_for_selectedrows = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x_for_selectedrows->type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); ins->emplace_back(x_for_selectedrows); } 
else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ee7c0eb96eae5c..c0e2b4584d0260 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -117,7 +117,7 @@ class FillConstantOp : public framework::OperatorWithKernel { const auto& str_value = ctx.Attr("str_value"); value = str_value.empty() ? "value" : "str_value"; } - if (!ctx.OutputVar("Out")->IsType()) { + if (!ctx.OutputVar("Out")->IsType()) { return framework::KernelSignature("full", {}, {shape, value}, {"Out"}); } return framework::KernelSignature("fill_constant.unregistered", {}, {}, {}); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 9e9bd2e0fbbc94..c74cf2a824c830 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -92,8 +92,8 @@ class FillConstantKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = out_var->GetMutable(); tensor->Resize(shape); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable()->mutable_value(); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(shape); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index fa0cab04168d1e..1402f3404fd6de 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -29,7 +29,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 
4e4322947a8571..fc782dc5511751 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -30,7 +30,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -200,8 +200,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( @@ -215,7 +215,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); // runtime shape d_table->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ce7df7eec15ea..67c265c97e4616 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -57,7 +57,7 @@ class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { class GetTensorFromSelectedRowsKernel { public: void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.Input("X"); + auto *x = ctx.Input("X"); auto *out = ctx.Output("Out"); out->Resize(x->value().dims()); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 
a6f5fb017a752e..17734b9c542c83 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -204,7 +204,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { "Custom tree must be set for sparse mode!")); framework::Vector real_rows = PathToRows(*path); auto* w_grad = - ctx.Output(framework::GradVarName("W")); + ctx.Output(framework::GradVarName("W")); w_grad->set_rows(real_rows); // Build a map of id -> row_index to speed up finding the index of one id w_grad->set_height(w.dims()[0]); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 3db0fdf5e6da4e..72dd0fc7432471 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -16,39 +16,121 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" namespace paddle { namespace operators { using framework::Tensor; +using platform::FastDivMod; using DataLayout = framework::DataLayout; +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( + const platform::CUDADeviceContext& context, int num_img, int height, + int width) { + const int kThreadsPerBlock = 256; + int max_threads_per_block = context.GetMaxThreadsPerBlock(); // 1024 + int max_threads = std::min(kThreadsPerBlock, max_threads_per_block); + + int block_x = std::min(GetLastPow2(width), max_threads); + int block_y = std::min(GetLastPow2(height), max_threads / block_x); + int block_z = std::min(num_img, max_threads / block_x / block_y); + + dim3 max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = std::min(max_grid_dim.x, 
platform::DivUp(width, block_x)); + int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y)); + int grid_z = + std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4)); + + const int capability = context.GetComputeCapability(); + platform::GpuLaunchConfig config; + config.compute_capability = capability; + config.thread_per_block = dim3(block_x, block_y, block_z); + config.block_per_grid = dim3(grid_x, grid_y, grid_z); + return config; +} + +struct FastDivModForInterpolate { + public: + FastDivMod channels_div; + FastDivMod output_w_div; + FastDivMod output_wc_div; + + explicit HOSTDEVICE FastDivModForInterpolate(const int channels, + const int output_w, + const int outout_wc) + : channels_div(FastDivMod(channels)), + output_w_div(FastDivMod(output_w)), + output_wc_div(FastDivMod(outout_wc)) {} +}; + +template +__global__ void KeNearestNeighborInterpNCHWFw( + const T* in, const size_t in_img_h, const size_t in_img_w, T* out, + const size_t out_img_h, const size_t out_img_w, const size_t nc, + const float ratio_h, const float ratio_w, const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + out[out_index] = in[in_index]; + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int 
channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -57,13 +139,8 @@ __global__ void KeNearestNeighborInterpFw( ? static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - if (data_layout == DataLayout::kNCHW) { - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; } } @@ -1292,11 +1369,25 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWFw< + T><<>>( + input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, + ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, interp_divmods); + } } else if ("bilinear" == interp_method) { dim3 thread_num = config.thread_per_block; #ifdef WITH_NV_JETSON diff --git 
a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 753b34484e4112..c4bc3a7fda154f 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -55,8 +55,8 @@ class OverflowOp : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_ENFORCE_EQ( true, false, diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 99db1c7e081dad..abed0e6903dd39 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -58,8 +58,8 @@ class OverflowKernel : public framework::OpKernel { if (x->IsType()) { auto* in = ctx.Input("X"); functor(*in, out); - } else if (x->IsType()) { - auto& in = ctx.Input("X")->value(); + } else if (x->IsType()) { + auto& in = ctx.Input("X")->value(); functor(in, out); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 316197ac23c850..3b48a41ed4f75e 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -62,8 +62,8 @@ class OverflowV2Op : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_THROW(plat::errors::InvalidArgument( "Cannot find the input data type by all input data")); diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 66160695c3d5aa..89ad4325a5a534 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -50,7 +50,7 @@ class LoadOpKernel : public 
framework::OpKernel { if (out_var->IsType()) { LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { LoadSelectedRows(fin, place, out_var); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class LoadOpKernel : public framework::OpKernel { void LoadSelectedRows(std::istream &fin, const platform::Place &place, framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); + auto *selectedRows = var->GetMutable(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 70aad1d3238f2f..475d0922ccc693 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; template diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 89c84d9e143773..7a32e13122852c 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,7 +151,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a89d5fb7cb6e5d..91b7f91c8e3bc5 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ 
b/paddle/fluid/operators/lookup_table_op.h @@ -28,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -82,8 +82,8 @@ class LookupTableKernel : public framework::OpKernel { } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -155,8 +155,8 @@ class LookupTableGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -171,7 +171,8 @@ class LookupTableGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 44a6151f1b6ce6..74ad0e4978b4ec 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -152,7 +152,8 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = 
context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 54564395c6d04c..6ea9e58198fbff 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -86,8 +86,8 @@ class LookupTableV2Kernel : public framework::OpKernel { row_width * sizeof(T)); } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -132,8 +132,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -148,7 +148,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { if (is_sparse) { auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); int64_t ids_num = ids_t->numel(); std::vector ids; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 
7c50ba630dbd91..a94bb594be5f9d 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -227,11 +227,11 @@ template struct MatrixBitCodeFunctorMulGradWeightSR : public boost::static_visitor { const framework::Tensor &tmat_; - framework::SelectedRows *weight_; + pten::SelectedRows *weight_; const framework::Tensor &input_; MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) : tmat_(tmat), weight_(weight), input_(input) {} @@ -274,7 +274,7 @@ struct MatrixBitCodeFunctorMulGradWeightSR template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) { MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); code_table_.apply_visitor(func); diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 71d905214ab9f5..13ddd27cbf0d7b 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -252,8 +252,7 @@ class MatrixBitCodeFunctor { /* For SelectedRows Weight, For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, + void MulGradWeight(const framework::Tensor& tmat, pten::SelectedRows* weight, const framework::Tensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f6178eb0a1eb6e..8cd3e1367d86d9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -24,9 +24,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const 
platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -94,7 +94,7 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -154,9 +154,8 @@ template struct SelectedRowsAddTensor; template struct SelectedRowsAddTo { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -198,9 +197,9 @@ template struct SelectedRowsAddTo; template struct SelectedRowsSumTo { void operator()(const platform::CPUDeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2) { + pten::SelectedRows* input2) { // Ensure all selected rows have the same height size_t size = 0u; for (auto iter = input1.begin(); iter != input1.end(); ++iter) { @@ -242,8 +241,7 @@ template struct SelectedRowsSumTo; template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { if (UNLIKELY(input1.rows().size() == 0)) { LOG(WARNING) << "input selected rows is empty!"; return; @@ -313,7 +311,7 @@ 
typename std::enable_if::value>::type elementwise_add_to( template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -347,7 +345,7 @@ add_sparse_inputs(const std::vector& inputs, template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -371,32 +369,31 @@ add_sparse_inputs(const std::vector& inputs, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { - std::vector inputs; + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output, sorted_result); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -409,7 +406,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -480,24 +477,23 @@ struct MergeAdd { #ifdef PADDLE_WITH_XPU template struct MergeAdd { - framework::SelectedRows operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::XPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; @@ -537,14 +533,14 @@ struct MergeAdd { } void operator()(const platform::XPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -557,7 +553,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -628,29 +624,28 @@ struct MergeAdd { #endif template struct MergeAverage { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output) { - std::vector inputs; + const pten::SelectedRows& input, pten::SelectedRows* output) { + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output) { + const std::vector& inputs, + pten::SelectedRows* output) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -663,7 +658,7 @@ struct MergeAverage { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -750,7 +745,7 @@ template struct MergeAverage; template struct UpdateToTensor { void operator()(const platform::CPUDeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 654a5653cbed1f..2ae2aaebb6c532 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -26,9 +26,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -117,7 +117,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, template struct SelectedRowsAddTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -182,9 +182,8 @@ 
template struct SelectedRowsAddTensor struct SelectedRowsAddTo { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -250,8 +249,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, template struct SelectedRowsAddToTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( @@ -320,24 +318,23 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CUDADeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows_cpu(row_set.begin(), row_set.end()); framework::Vector merge_rows(merge_rows_cpu); @@ -368,14 +365,14 @@ struct MergeAdd { } void operator()(const 
platform::CUDADeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -388,7 +385,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; for (auto* input : inputs) { if (input->rows().size() == 0) { @@ -499,7 +496,7 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, template struct UpdateToTensor { void operator()(const platform::CUDADeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. 
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 8ba7851d7b979a..690082036c5e0a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -35,15 +35,14 @@ namespace math { template struct SelectedRowsAdd { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output); + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, pten::SelectedRows* output); }; template struct SelectedRowsAddTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; @@ -51,17 +50,17 @@ struct SelectedRowsAddTensor { template struct SelectedRowsAddTo { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, framework::SelectedRows* input2); + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2); }; // input2 = [all input in input1] + input2 template struct SelectedRowsSumTo { void operator()(const DeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2); + pten::SelectedRows* input2); }; // FIXME: The result of SelectedRowsAddToTensor maybe non deterministic, @@ -70,8 +69,7 @@ struct SelectedRowsSumTo { template struct SelectedRowsAddToTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; namespace scatter { @@ -80,29 +78,25 @@ template struct MergeAdd { // unary functor, merge by adding duplicated rows in // the input SelectedRows object. 
- framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output, const bool sorted_result = false); void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, - const bool sorted_result = false); - void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, - const bool sorted_result = false); + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false); }; template struct MergeAverage { - framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input); - void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output); void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output); + const std::vector& inputs, + pten::SelectedRows* output); }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; @@ -111,8 +105,7 @@ enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc 
b/paddle/fluid/operators/math/selected_rows_functor_test.cc index e0b36816490662..19e70f924f15e7 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -27,8 +27,8 @@ TEST(selected_rows_functor, cpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -37,8 +37,8 @@ TEST(selected_rows_functor, cpu_add) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -46,8 +46,7 @@ TEST(selected_rows_functor, cpu_add) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simplely concat two SelectedRows @@ -130,8 +129,8 @@ TEST(selected_rows_functor, cpu_add_to) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -140,8 +139,8 @@ TEST(selected_rows_functor, cpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = 
selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -149,8 +148,7 @@ TEST(selected_rows_functor, cpu_add_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -230,8 +228,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -242,8 +240,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { paddle::operators::math::scatter::MergeAverage< paddle::platform::CPUDeviceContext, float> merge_average_functor; - paddle::framework::SelectedRows output = - merge_average_functor(ctx, *selected_rows); + pten::SelectedRows output = merge_average_functor(ctx, *selected_rows); auto out_height = output.height(); EXPECT_EQ(out_height, height); @@ -270,8 +267,8 @@ TEST(selected_rows_functor, cpu_merge_add_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -279,8 +276,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { cpu_place); functor(ctx, in_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -311,8 +307,8 @@ TEST(selected_rows_functor, cpu_merge_add_int) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr 
selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -320,8 +316,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { cpu_place); functor(ctx, in_value, 1); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -354,8 +349,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -364,8 +359,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -373,14 +368,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { cpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -411,8 +405,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { int64_t row_numel = 8; std::vector rows1{1, 3, 5, 7, 9}; - std::unique_ptr selected_rows1{ - 
new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -421,8 +415,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { set_const(ctx, in1_value, 1.0); std::vector rows2{0, 2, 4, 6, 8}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -430,14 +424,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { cpu_place); set_const(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -472,8 +465,8 @@ TEST(selected_rows_functor, cpu_sum_to) { int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -482,8 +475,8 @@ TEST(selected_rows_functor, cpu_sum_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -491,8 +484,7 @@ 
TEST(selected_rows_functor, cpu_sum_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); // simplely concat two SelectedRows @@ -501,7 +493,7 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::operators::math::SelectedRowsSumTo sum_to_functor; - sum_to_functor(ctx, std::vector( + sum_to_functor(ctx, std::vector( {selected_rows1.get(), selected_rows2.get()}), std::vector({0, in1_value->numel()}), output.get()); auto out_height = output->height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index ebcd97b32c4a30..e826c2a7244f71 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -29,8 +29,8 @@ TEST(selected_rows_functor, gpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -48,8 +48,8 @@ TEST(selected_rows_functor, gpu_add) { #endif std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -57,8 +57,7 @@ TEST(selected_rows_functor, gpu_add) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ 
-152,8 +151,8 @@ TEST(selected_rows_functor, gpu_add_to) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -162,8 +161,8 @@ TEST(selected_rows_functor, gpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -171,8 +170,7 @@ TEST(selected_rows_functor, gpu_add_to) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -264,8 +262,8 @@ TEST(selected_rows_functor, gpu_merge_add) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -274,8 +272,8 @@ TEST(selected_rows_functor, gpu_merge_add) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -283,14 +281,13 @@ TEST(selected_rows_functor, gpu_merge_add) { gpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr 
output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd< paddle::platform::CUDADeviceContext, float> merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index e1b81c0c59241a..bdedb8e7d29458 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -51,7 +51,7 @@ class MemcpyD2HFunctor { } } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT support yet.")); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 7f487001040307..c9995eeca16cd4 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -59,7 +59,7 @@ class MemcpyH2DFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT support yet.")); diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index ac4a0d1ab111ed..40c7aceda51160 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -75,7 +75,7 @@ class MemcpyFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( 
"Memcpy for SelectedRows is NOT support yet.")); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h index 4c977e94b175c9..0fe262dea3b135 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.h +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -24,8 +24,8 @@ template class MergeSelectedRowsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); math::scatter::MergeAdd merge_func; merge_func(context.template device_context(), *x, out); diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 55f684b66485bb..edd2ae4ca9c87c 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using Sampler = math::Sampler; using DDim = framework::DDim; @@ -364,8 +364,8 @@ class NCEGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("Weight")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("Weight"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("Weight"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -373,7 +373,8 @@ class NCEGradKernel : public framework::OpKernel { "must be either LoDTensor or SelectedRows")); } - auto d_w = context.Output(framework::GradVarName("Weight")); + auto d_w = + context.Output(framework::GradVarName("Weight")); d_w->set_rows(labels); d_w->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc 
index 255dc5bb083114..31d3e1208dadb7 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -111,7 +111,7 @@ size_t FindPos(const std::vector& rows, int64_t value) { template struct SparseAdagradFunctor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 8b939b7c6b3ba2..a7c32255bd1ee0 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -72,7 +72,7 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, template struct SparseAdagradFunctor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 057bd4e863ddf7..c2dc3f095ed99d 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -22,16 +22,15 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext &context, - const framework::SelectedRows &grad, + void operator()(const DeviceContext &context, const pten::SelectedRows &grad, const framework::Tensor &learning_rate, T epsilon, framework::Tensor *moment, framework::Tensor *param); }; template -framework::SelectedRows SquareSelectedRows( - const DeviceContext &context, const framework::SelectedRows &input) { - framework::SelectedRows out; +pten::SelectedRows SquareSelectedRows(const DeviceContext &context, + const pten::SelectedRows &input) { + pten::SelectedRows out; out.set_rows(input.rows()); out.set_height(input.height()); out.mutable_value()->mutable_data(input.value().dims(), @@ -88,7 +87,7 @@ class AdagradOpKernel : public framework::OpKernel { param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor, platform::errors::InvalidArgument( @@ -101,7 +100,7 @@ class AdagradOpKernel : public framework::OpKernel { SparseAdagradFunctor functor; functor(ctx.template device_context(), - *ctx.Input("Grad"), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 1ef46ef085c5d7..c7ffb53a058826 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -314,8 +314,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { 
beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -330,8 +330,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index bb044b4b4986e3..bcc314cd57c017 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -521,8 +521,8 @@ class AdamOpKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())[0] = beta2 * beta2_pow->data()[0]; } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -537,8 +537,8 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index e462c20c7f51db..fd83b76e02a24f 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -195,8 +195,8 @@ class AdamOpXPUKernel : public framework::OpKernel { xpu_wait(dev_ctx.x_context()->xpu_stream); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = 
ctx.Input("Grad"); auto& dev_ctx = ctx.template device_context(); if (grad->rows().size() == 0) { @@ -213,8 +213,8 @@ class AdamOpXPUKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index a8b16e73dbfffe..8bce415cb1ab98 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -337,8 +337,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -353,8 +353,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 6bf8c8d724fb89..9c9355921d8273 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -180,11 +180,11 @@ class FTRLOpKernel : public framework::OpKernel { } s_acc_out.device(place) = sq_accum + g * g; - } else if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto grad = ctx.Input("Grad"); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; 
math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 9a3eaa66caa8e8..f1158703f028b6 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -552,7 +552,7 @@ class LambOpKernel : public framework::OpKernel { trust_ratio_div_ptr, skip_update_flag); for_range(moment_update_functor); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(IsMultiPrecision, false, platform::errors::Unimplemented( "SelectedRows gradient is not supported when " @@ -562,7 +562,7 @@ class LambOpKernel : public framework::OpKernel { platform::errors::Unimplemented( "SelectedRows gradient is not supported when " "multi_precision=True.")); - auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Lamb"); if (grad.rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; @@ -578,8 +578,8 @@ class LambOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index 450ef376799d3d..ee3111c7dd6a09 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -48,7 +48,7 @@ class SGDOneDNNKernel : public SGDOpKernel { VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto 
&grad_rows = grad->rows(); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 2d713308fd9389..79d76d52f48c8c 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -27,7 +27,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using framework::SelectedRows; +using pten::SelectedRows; struct NoNesterov; struct UseNesterov; @@ -545,9 +545,9 @@ class MomentumOpKernel : public framework::OpKernel { } } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // sparse update embedding with selectedrows - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); // sparse update maybe empty. if (grad->rows().size() == 0) { @@ -555,8 +555,8 @@ class MomentumOpKernel : public framework::OpKernel { return; } - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index e3f0e5cc04d9ee..a71847c4690821 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -74,7 +74,7 @@ class NPUMomentumOpKernel : public framework::OpKernel { regularized_grad, mu_tensor}, {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( "Unsupport SparseMomentum")); } else { diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 9971cb92306a27..a01f84b37c4eb2 100644 --- 
a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -218,10 +218,10 @@ class RmspropOpKernel : public framework::OpKernel { rho, epsilon, momentum, grad_func)); } } - } else if (grad_var->IsType()) { - auto &grad = grad_var->Get(); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows *merged_grad = &tmp_merged_grad; + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 28f73e0618c2ae..08c40e02b1702b 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -80,7 +80,7 @@ class SGDOp : public framework::OperatorWithKernel { // supported cases bool dense_param_sparse_grad = param_var->IsType() && - grad_var->IsType(); + grad_var->IsType(); bool dense_param_and_grad = param_var->IsType() && grad_var->IsType(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 5e3ae6c017bcac..7ecd84f4ff16a3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -112,7 +112,7 @@ class SGDOpKernel param->numel(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. 
@@ -121,7 +121,7 @@ class SGDOpKernel platform::errors::InvalidArgument( "The input tensor Param of SgdOp should be equal with ParamOut " "if variable's type is SelectedRows.")); - auto* grad = ctx.Input("Grad"); + auto* grad = ctx.Input("Grad"); auto in_height = grad->height(); auto out_dims = param_out->dims(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 9d98e745a01aec..7df6bbf410d2d7 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -60,13 +60,13 @@ struct sgd_dense_param_kernel< // SelectedRows template struct sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId> { + T, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -114,12 +114,12 @@ struct sgd_dense_param_kernel< // SelectedRows template <> struct sgd_dense_param_kernel< - platform::bfloat16, framework::VarTypeTrait::kId> { + platform::bfloat16, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -163,7 +163,7 @@ class SGDOpKernel if (param_var->IsType()) { invoke_dense_param_kernel(ctx); - } else if (param_var->IsType()) { + } else if (param_var->IsType()) { sparse_param_and_grad_kernel(ctx); } else { PADDLE_ENFORCE_EQ( @@ -200,7 +200,7 @@ class SGDOpKernel 
grad->numel(), sz)); dense_param_and_grad_kernel(ctx); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. @@ -209,7 +209,7 @@ class SGDOpKernel "The input tensor Param of SgdOp " "should be equal with ParamOut if variable's " "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); // for distributed training, a sparse var may be empty, // just skip updating. @@ -259,13 +259,13 @@ class SGDOpKernel const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, platform::errors::InvalidArgument( "When param is SelectedRows, gradient should also " "be SelectedRows")); - const auto ¶m = param_var->Get(); - auto *param_out = ctx.Output("ParamOut"); - const auto &grad = grad_var->Get(); + const auto ¶m = param_var->Get(); + auto *param_out = ctx.Output("ParamOut"); + const auto &grad = grad_var->Get(); // for distributed training, a sparse var may be empty, // just skip updating. 
@@ -309,7 +309,7 @@ class SGDOpKernel virtual void dense_param_sparse_grad_kernel( const framework::ExecutionContext &ctx) const { detail::sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId>()(ctx); + T, framework::VarTypeTrait::kId>()(ctx); } }; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 88e94ba039ac27..c0bd906685d4d8 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -76,22 +76,13 @@ struct AbsFunctor { } }; -template +template struct UnsignedPowFunctor { HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(inline_abs(x), static_cast(porder))); - } - float porder; -}; - -template -struct PowFunctor { - HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(x, static_cast(porder))); + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(inline_pow(inline_abs(x), static_cast(porder))); } float porder; }; @@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel { const T* x = in_x->data(); T* norm = out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); float porder = ctx.Attr("porder"); bool asvector = ctx.Attr("asvector"); int axis = ctx.Attr("axis"); std::vector reduce_axis = {axis}; reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector); - auto stream = ctx.cuda_device_context().stream(); using MT = typename details::MPTypeTrait::Type; @@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel { TensorReduceFunctorImpl>( *in_x, out_norm, AbsFunctor(), reduce_axis, stream); } else { - framework::Tensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - std::vector ins = {in_x}; - std::vector outs = {&tmp_x}; - auto func = 
UnsignedPowFunctor(porder); + TensorReduceFunctorImpl>( + *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + + const framework::Tensor* tmp_norm = out_norm; + std::vector ins = {tmp_norm}; + std::vector outs = {out_norm}; const auto& cuda_ctx = ctx.template device_context(); - - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func); - framework::Tensor tmp_y; - tmp_y.mutable_data(ndim, ctx.GetPlace()); - TensorReduceFunctorImpl>( - tmp_x, &tmp_y, kps::IdentityFunctor(), reduce_axis, stream); - const framework::Tensor* tmp_norm = &tmp_y; - ins = {tmp_norm}; - outs = {out_norm}; - auto func_inverse = UnsignedPowFunctor(1. / porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func_inverse); + ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( + cuda_ctx, ins, &outs, UnsignedPowFunctor(1. / porder)); } } }; @@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto equals = ((*x).abs() == y->broadcast(dim)); - auto ones = dx->constant(static_cast(1.)); - auto negs = dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) * - positives.select(ones, negs); + dx->device(place) = dy->broadcast(dim) * (*x).sign() * + ((*x).abs() == y->broadcast(dim)).template cast(); } }; template -struct PNormPostGradFunctor { +struct PNormGradFunctor { + HOSTDEVICE explicit inline PNormGradFunctor(float porder) { + this->porder = static_cast(porder - 1.); + } template void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto ones = dx->constant(static_cast(1.)); - auto negs = 
dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) * - positives.select(ones, negs); + dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() * + dy->broadcast(dim) * + (*y).pow(-this->porder).broadcast(dim); } + T porder; }; template @@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(cuda_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { + AbsMaxAndMinGradFunctor functor; LaunchReduceGradKernel>( - ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all); + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } else { - framework::Tensor tmp_norm; - tmp_norm.mutable_data(in_norm->dims(), ctx.GetPlace()); - std::vector ins = {in_norm}; - std::vector outs = {&tmp_norm}; - auto pow_functor = PowFunctor(1. - porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, PowFunctor>(cuda_ctx, ins, &outs, - pow_functor); - ins = {in_x}; - outs = {out_dx}; - auto unsigned_pow = UnsignedPowFunctor(porder - 1.); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, unsigned_pow); - const framework::Tensor* tmp_norm_const = &tmp_norm; - LaunchReduceGradKernel>( - ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all); + auto functor = PNormGradFunctor(porder); + LaunchReduceGradKernel>( + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } } }; diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 292db60079e4cd..763f67360ed0ad 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -40,8 +40,8 @@ class 
DistributedLookupTableKernel : public framework::OpKernel { if (var->IsType()) { emb_dim = var->Get().dims()[1]; - } else if (var->IsType()) { - emb_dim = var->Get().value().dims()[1]; + } else if (var->IsType()) { + emb_dim = var->Get().value().dims()[1]; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of `W` must be Tensor, SelectedRows.But got " diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index cb27dc75eb2faf..b3a745fc99538e 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -39,8 +39,8 @@ class FakeInitOp : public framework::OperatorBase { if (out_var.IsType()) { tensor = out_var.GetMutable(); tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(Attr>("shape"))); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 5029aa0ebdcc0c..a1e29ed3e3459d 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -52,7 +52,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -123,7 +123,7 @@ void InitTensorsOnClient2(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = 
w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 6b1ab77b45d35d..c2e4407f662a29 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -55,7 +55,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -76,7 +76,7 @@ void CreateVarsOnScope(framework::Scope* scope) { void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 3a361360e2ed7e..42eea54ce74a51 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -59,7 +59,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -121,7 +121,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = 
scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 06c9f23dd2c26f..4490f08b2129ad 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel { broadcast_dim[0]); } else { int rank = input->dims().size(); + LogsumexpGradFunctor functor; switch (rank) { case 1: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 2: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 3: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 4: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 2e5bd7a42b1d1a..661fb772f1c573 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, const framework::Tensor* x, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, - const std::vector& dims) { + Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); const int64_t reduced = x->numel() / unreduced; DDim out_dim(out->dims()); @@ -157,7 +157,7 @@ void HandleLargeDimGrad(const 
framework::ExecutionContext& context, dx->Resize({unreduced, reduced}); ReduceGradFunctor( context.template device_context(), shuffled_x, *out, *dout, - dx, {1}); + dx, functor, {1}); // transpose dX std::vector origin_axis(x_dim.size()); GetOriginDimFromShuffled(x_dim, dims, &origin_axis); @@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, const framework::Tensor* input0, const framework::Tensor* input1, const framework::Tensor* input2, - paddle::framework::Tensor* output, + paddle::framework::Tensor* output, Functor functor, const std::vector& dims, bool reduce_all = false) { if (reduce_all) { @@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, *context.template device_context().eigen_device(); auto broadcast_dim = Eigen::array({{static_cast(input0->numel())}}); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { @@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, case 1: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 2: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 3: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 4: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 5: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 6: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; default: - HandleLargeDimGrad(context, 
input0, input1, - input2, output, dims); + HandleLargeDimGrad( + context, input0, input1, input2, output, functor, dims); break; } } @@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel { // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and // not be set as Input in grad Maker, use Out_grad to replace here if (!input1) input1 = input2; - LaunchReduceGradKernel( - context, input0, input1, input2, output, const_dims, reduce_all); + Functor functor; + LaunchReduceGradKernel(context, input0, input1, + input2, output, functor, + const_dims, reduce_all); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 3da27bc8ac8d44..1f3839c8dc7e6d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context, const framework::Tensor& input0, const framework::Tensor& input1, const framework::Tensor& input2, - framework::Tensor* output, + framework::Tensor* output, Functor functor, const std::vector& dims) { auto x = EigenTensor::From(input0); auto x_grad = EigenTensor::From(*output); @@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context, auto& place = *context.eigen_device(); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broad_cats_times); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index dc82d7c6c1ee49..5170729a7692f3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -448,7 +448,8 @@ class ReshapeKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + 
pten::ReshapeKernel(static_cast(dev_ctx), + *pt_x.get(), pt_scalar_shape, pt_out); } #endif // non-inplace need move all result from pt_out to out, inplace need set @@ -485,7 +486,8 @@ class ReshapeGradKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *pt_d_out.get(), pt_d_x.get()); } #endif } @@ -516,7 +518,9 @@ class ReshapeDoubleGradKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *pt_dd_x.get(), + pt_dd_out.get()); } #endif } diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index c130dbb35a0daa..a97876957abd38 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -46,7 +46,7 @@ using ProgramDesc = framework::ProgramDesc; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; namespace details { @@ -86,21 +86,21 @@ static void CheckOutputVarStatus(const Variable &src_var, "RunProgram(Grad)Op's internal " "scope is not initialized.", var_name)); - } else if (dst_var.IsType()) { + } else if (dst_var.IsType()) { PADDLE_ENFORCE_EQ( - src_var.IsType(), true, + src_var.IsType(), true, platform::errors::InvalidArgument( "The output variable %s get from " "RunProgram(Grad)Op's internal scope holds " "wrong type. 
Expect type is SelectedRows, but receive type is %s.", var_name, platform::demangle(framework::ToTypeName(src_var.Type())))); - PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from " - "RunProgram(Grad)Op's " - "internal scope is not initialized.", - var_name)); + PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), + true, platform::errors::InvalidArgument( + "The tensor in output variable %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + var_name)); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -118,12 +118,12 @@ static void VariableShare(const Variable &src_var, Variable *dst_var) { auto *lod_tensor = dst_var->GetMutable(); lod_tensor->ShareDataWith(src_var.Get()); lod_tensor->set_lod(src_var.Get().lod()); - } else if (src_var.IsType()) { - auto *selected_rows = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto *selected_rows = dst_var->GetMutable(); selected_rows->mutable_value()->ShareDataWith( - src_var.Get().value()); - selected_rows->set_rows(src_var.Get().rows()); - selected_rows->set_height(src_var.Get().height()); + src_var.Get().value()); + selected_rows->set_rows(src_var.Get().rows()); + selected_rows->set_height(src_var.Get().height()); } } diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index 5ed71a26c8aa35..2a61d7ce0c25b7 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -56,7 +56,7 @@ class SaveOpKernel : public framework::OpKernel { if (input_var->IsType()) { SaveLodTensor(ctx, place, input_var, filename); - } else if (input_var->IsType()) { + } else if (input_var->IsType()) { SaveSelectedRows(ctx, place, input_var, filename); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class SaveOpKernel : public framework::OpKernel { const platform::Place &place, const framework::Variable *var, const 
std::string &filename) const { - auto &selectedRows = var->Get(); + auto &selectedRows = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index a04837b6949e0f..2a30d3f0b08842 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -55,9 +55,9 @@ class ScaleKernel : public framework::OpKernel { } auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 8d9690a866ae26..1e1187845ce477 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -57,9 +57,9 @@ class ScaleMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor)); auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index 75f3cc9c5aa300..026a5dda89b5f0 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -32,9 +32,9 @@ class ScaleXPUKernel : public framework::OpKernel { auto bias = static_cast(ctx.Attr("bias")); auto bias_after_scale = ctx.Attr("bias_after_scale"); auto* out_var = ctx.OutputVar("Out"); - if 
(in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h index 1f90c041c09533..cac8c10c207a51 100644 --- a/paddle/fluid/operators/shape_op.h +++ b/paddle/fluid/operators/shape_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; template class ShapeKernel : public framework::OpKernel { @@ -29,8 +29,8 @@ class ShapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 94f4737191d11a..89a1e952d1dc55 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -22,7 +22,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; template class ShapeNPUKernel : public framework::OpKernel { @@ -30,8 +29,8 @@ class ShapeNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git 
a/paddle/fluid/operators/share_data_op.h b/paddle/fluid/operators/share_data_op.h index d876b4fabd5c09..f668a1cf01dfc5 100644 --- a/paddle/fluid/operators/share_data_op.h +++ b/paddle/fluid/operators/share_data_op.h @@ -29,9 +29,8 @@ class ShareDataKernel : public framework::OpKernel { auto *detach_tensor = out_var->GetMutable(); detach_tensor->ShareDataWith(origin_tensor); } else { - const auto &origin_selected_rows = in_var->Get(); - auto *detach_selected_rows = - out_var->GetMutable(); + const auto &origin_selected_rows = in_var->Get(); + auto *detach_selected_rows = out_var->GetMutable(); detach_selected_rows->mutable_value()->ShareDataWith( origin_selected_rows.value()); } diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index 0adc12e684c3a4..a0d4b4c4eb4604 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); int r = XPU_SUCCESS; - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version == paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm(x->numel()); r = xpu::clip_v2(dev_ctx.x_context(), diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 0f520adba57a20..00aab6b75006ae 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -165,9 +165,9 @@ class SumOp : public framework::OperatorWithKernel { return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); - } else if (x_vars[0]->IsType()) { + } else if (x_vars[0]->IsType()) { for (auto& var : x_vars) { - auto& value = var->Get().value(); + auto& value = var->Get().value(); if (value.IsInitialized()) { return 
framework::OpKernelType(value.type(), ctx.device_context(), layout, library); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 4288e9415aa869..9de9b0b6338dfc 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -151,7 +151,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { if (lod_length && in_i.IsInitialized()) { in_data.emplace_back(in_i.data()); } - } else if (in_vars[i]->IsType()) { + } else if (in_vars[i]->IsType()) { selectrow_index.push_back(i); } } @@ -162,7 +162,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { size_t rows = 0; int64_t length = 0; for (auto index : selectrow_index) { - auto &sr = in_vars[index]->Get(); + auto &sr = in_vars[index]->Get(); auto &sr_value = sr.value(); auto &sr_rows = sr.rows(); @@ -235,7 +235,7 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 61a9c8b11508f2..4e108b56a404d5 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using LoDTensor = framework::LoDTensor; template @@ -37,32 +37,32 @@ void SelectedRowsCompute(const framework::ExecutionContext &context) { return; } - std::vector inputs; + std::vector inputs; SelectedRows temp_in0; if (in_place) { - auto &in0 = in_vars[0]->Get(); + auto &in0 = in_vars[0]->Get(); temp_in0.set_height(in0.height()); temp_in0.set_rows(in0.rows()); framework::TensorCopy(in0.value(), in0.place(), context.device_context(), temp_in0.mutable_value()); inputs.push_back(&temp_in0); for (size_t i = 1; i < 
in_vars.size(); ++i) { - auto &in = in_vars[i]->Get(); + auto &in = in_vars[i]->Get(); if (in.rows().size() > 0) { inputs.push_back(&in); } } } else { for (auto &in_var : in_vars) { - auto &in = in_var->Get(); + auto &in = in_var->Get(); if (in.rows().size() > 0) { - inputs.push_back(&in_var->Get()); + inputs.push_back(&in_var->Get()); } } } - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_rows()->clear(); bool has_data = false; @@ -183,8 +183,8 @@ class SumKernel : public framework::OpKernel { } auto in = EigenVector::Flatten(in_t); result.device(place) = result + in; - } else if (in_vars[i]->IsType()) { - auto &in_t = in_vars[i]->Get(); + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); functor(context.template device_context(), in_t, out); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -194,7 +194,7 @@ class SumKernel : public framework::OpKernel { framework::ToTypeName(in_vars[i]->Type()))); } } - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index cdb4ad7c40826b..8c603a7c5d8c8f 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -73,8 +73,8 @@ class CPUUniformRandomKernel : public framework::OpKernel { } } - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); + if (out_var->IsType()) { + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); auto shape = ctx.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 63eecd15c2d69b..5278bdd2f1c725 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -111,8 
+111,8 @@ class GPUUniformRandomKernel : public framework::OpKernel { } } - if (out_var->IsType()) { - auto* selected_rows = out_var->GetMutable(); + if (out_var->IsType()) { + auto* selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); auto shape = context.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc index 1c2f2b07ce8975..6812a2b0b7085c 100644 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ b/paddle/fluid/operators/uniform_random_op_npu.cc @@ -40,8 +40,8 @@ class NPUUniformRandomKernel : public framework::OpKernel { } } - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); + if (out_var->IsType()) { + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); auto shape = ctx.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc index fed0accd8a14cd..848b72727bd282 100644 --- a/paddle/fluid/operators/uniform_random_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -41,8 +41,8 @@ class XPUUniformRandomKernel : public framework::OpKernel { } } - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); + if (out_var->IsType()) { + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); auto shape = ctx.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 97a31752333572..5695fd03bacf38 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} 
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context) +if(WITH_XPU) + target_link_libraries(device_context xpu_context) +endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 883767348f06a9..d07ef73a49e799 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -165,8 +165,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( return config; } -// TODO(wangchaochaohu): 3D will add later - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index f89c8c193ae7cf..d292ce130eb34a 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -4,7 +4,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 4c85168f68dd3a..ae5ec8e851d688 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,177 +15,36 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/fluid/platform/enforce.h" -#include "xpu/bkcl.h" + +#include "paddle/pten/backends/xpu/enforce_xpu.h" namespace paddle { namespace platform { // Note: XPU runtime api return int, not XPUError_t inline const char* xpuGetErrorString(int stat) { - switch (stat) { - case XPU_SUCCESS: - return "Success"; - case XPUERR_INVALID_DEVICE: - return "Invalid XPU device"; - case XPUERR_UNINIT: - return "XPU runtime not properly inited"; - case XPUERR_NOMEM: - return "Device memory not enough"; - case XPUERR_NOCPUMEM: - return "CPU memory not enough"; - case XPUERR_INVALID_PARAM: - return "Invalid parameter"; - case XPUERR_NOXPUFUNC: - return "Cannot get XPU Func"; - case XPUERR_LDSO: - return "Error loading dynamic library"; - case XPUERR_LDSYM: - return "Error loading func from dynamic library"; - case XPUERR_SIMULATOR: - return "Error from XPU Simulator"; - case XPUERR_NOSUPPORT: - return "Operation not supported"; - case XPUERR_ABNORMAL: - return "Device abnormal due to previous error"; - case XPUERR_KEXCEPTION: - return "Exception in kernel execution"; - case XPUERR_TIMEOUT: - return "Kernel execution timed out"; - case XPUERR_BUSY: - return "Resource busy"; - case XPUERR_USEAFCLOSE: - return "Use a stream after closed"; - case XPUERR_UCECC: - return "Uncorrectable ECC"; - case XPUERR_OVERHEAT: - return "Overheat"; - case XPUERR_UNEXPECT: - return "Execution error, reach unexpected control flow"; - case XPUERR_DEVRESET: - return "Device is being reset, try again later"; - case XPUERR_HWEXCEPTION: - return "Hardware module exception"; - case XPUERR_HBM_INIT: - return "Error init HBM"; - case XPUERR_DEVINIT: - return "Error init device"; - case XPUERR_PEERRESET: - return "Device is being reset, try again later"; - case XPUERR_MAXDEV: - return "Device count exceed limit"; - case XPUERR_NOIOC: - return "Unknown IOCTL command"; - case XPUERR_DMATIMEOUT: - return "DMA timed out, a reboot 
maybe needed"; - case XPUERR_DMAABORT: - return "DMA aborted due to error, possibly wrong address or hardware " - "state"; - case XPUERR_MCUUNINIT: - return "Firmware not initialized"; - case XPUERR_OLDFW: - return "Firmware version too old (<15), please update."; - case XPUERR_PCIE: - return "Error in PCIE"; - case XPUERR_FAULT: - return "Error copy between kernel and user space"; - case XPUERR_INTERRUPTED: - return "Execution interrupted by user"; - default: - return "unkonwn error"; - } + return pten::backends::xpu::xpuGetErrorString(stat); } inline const char* bkclGetErrorString(BKCLResult_t stat) { - switch (stat) { - case BKCL_SUCCESS: - return "BKCL_SUCCESS"; - case BKCL_INVALID_ARGUMENT: - return "BKCL_INVALID_ARGUMENT"; - case BKCL_RUNTIME_ERROR: - return "BKCL_RUNTIME_ERROR"; - case BKCL_SYSTEM_ERROR: - return "BKCL_SYSTEM_ERROR"; - case BKCL_INTERNAL_ERROR: - return "BKCL_INTERNAL_ERROR"; - default: - return "Unknown BKCL status"; - } + return pten::backends::xpu::bkclGetErrorString(stat); } inline const char* xdnnGetErrorString(int stat) { - switch (stat) { - case xpu::Error_t::SUCCESS: - return "XDNN_SUCCESS"; - case xpu::Error_t::INVALID_PARAM: - return "XDNN_INVALID_PARAM"; - case xpu::Error_t::RUNTIME_ERROR: - return "XDNN_RUNTIME_ERROR"; - case xpu::Error_t::NO_ENOUGH_WORKSPACE: - return "XDNN_NO_ENOUGH_WORKSPACE"; - case xpu::Error_t::NOT_IMPLEMENT: - return "XDNN_NOT_IMPLEMENT"; - default: - return "Unknown XDNN status"; - } + return pten::backends::xpu::xdnnGetErrorString(stat); } inline std::string build_xpu_error_msg(int stat) { - std::string msg("XPU Error <" + std::to_string(stat) + ">, "); - return msg + xpuGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_error_msg(BKCLResult_t stat) { - std::string msg("BKCL Error, "); - return msg + bkclGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_xdnn_error_msg(int 
stat, std::string msg) { - return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg); } -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); - -#undef DEFINE_EXTERNAL_API_TYPE - -} // namespace details - -#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __XPU_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __XPU_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h index 1177fd63742b3b..6b5c32fd511b36 100644 --- a/paddle/fluid/platform/device/xpu/xpu_header.h +++ b/paddle/fluid/platform/device/xpu/xpu_header.h @@ -15,42 +15,5 @@ limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -#include "xpu/runtime.h" -#include "xpu/runtime_ex.h" -#include "xpu/xdnn.h" - -namespace xpu = baidu::xpu::api; - -static std::map XPUAPIErrorMsg = { - {xpu::Error_t::SUCCESS, "xpu api success"}, - {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, - {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, - {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; - -template -class XPUTypeTrait { - public: - using Type = T; -}; - -template <> -class XPUTypeTrait { - public: - using Type = float16; -}; - -template <> -class XPUTypeTrait { - public: - using Type = bfloat16; -}; - +#include "paddle/pten/backends/xpu/xpu_header.h" #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index a8c6ee8f3b0353..cf08f9ada6b300 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,22 +14,14 @@ limitations under the License. */ #include #include #include "gflags/gflags.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -PADDLE_DEFINE_EXPORTED_string( - selected_xpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (XPU). If you want to use " - "all visible devices, set this to empty string. 
NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" - "share-memory only."); + +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -37,101 +29,40 @@ namespace platform { /**************************** Version Management **************************/ //! Get the version of XPU Driver -int GetDriverVersion() { - uint32_t driver_version_major = 0; - uint32_t driver_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_driver_version(&driver_version_major, &driver_version_minor)); - int driver_version = driver_version_major * 10 + driver_version_minor; - return driver_version; -} +int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); } //! Get the version of XPU Runtime -int GetRuntimeVersion() { - uint32_t rumtime_version_major = 0; - uint32_t rumtime_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); - int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; - return runtime_version; -} +int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); } /**************************** Device Management **************************/ -static int GetDeviceCountImpl() { - const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); - if (xpu_visible_devices != nullptr) { - std::string xpu_visible_devices_str(xpu_visible_devices); - if (std::all_of(xpu_visible_devices_str.begin(), - xpu_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. 
No XPU detected."; - return 0; - } - } - - int count = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); - return count; -} - -int GetXPUDeviceCount() { - static auto dev_cnt = GetDeviceCountImpl(); - return dev_cnt; -} +int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); } int GetXPUCurrentDeviceId() { - int dev_id; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - return dev_id; + return pten::backends::xpu::GetXPUCurrentDeviceId(); } -void SetXPUDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, GetXPUDeviceCount(), - platform::errors::InvalidArgument("id must less than XPU count")); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); -} +void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); } //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices() { // use user specified XPUs in single-node multi-process mode. 
- std::vector devices; - if (!FLAGS_selected_xpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetXPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; + return pten::backends::xpu::GetXPUSelectedDevices(); } /**************************** Memory Management **************************/ void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - platform::XPUDeviceGuard guard(dst_place.device); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); } void MemcpySyncD2H(void* dst, const void* src, size_t count, const platform::XPUPlace& src_place) { - platform::XPUDeviceGuard guard(src_place.device); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.GetByPlace(src_place); dev_ctx->Wait(); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx); } // if src.device == dst.device and you need sync , after call this function, @@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count, void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, const void* src, const platform::XPUPlace& src_place, size_t count) { - int dev_id = GetXPUCurrentDeviceId(); - if (dst_place.device == dev_id && src_place.device == dev_id) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - PADDLE_ENFORCE_XDNN_SUCCESS( - xpu::copy(dev_ctx->x_context(), static_cast(src), - static_cast(dst), count), - "copy "); - } else { - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy_peer(dst_place.device, dst, 
src_place.device, src, count)); - } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count, + *dev_ctx); } /**************************** Others **************************/ -XPUVersion get_xpu_version(int dev_id) { - uint64_t v = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); - - if (v == K100 || v == K200) { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; - return XPU1; - } else { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; - return XPU2; - } +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) { + return pten::backends::xpu::get_xpu_version(dev_id); } } // namespace platform diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 220bebb9e6b055..03082e8dc50eca 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); -class XPUDeviceGuard { - public: - explicit inline XPUDeviceGuard(int dev_id) { - int prev_id = platform::GetXPUCurrentDeviceId(); - if (prev_id != dev_id) { - prev_id_ = prev_id; - platform::SetXPUDeviceId(dev_id); - } - } +using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard; - inline ~XPUDeviceGuard() { - if (prev_id_ != -1) { - platform::SetXPUDeviceId(prev_id_); - } - } - - XPUDeviceGuard(const XPUDeviceGuard &o) = delete; - XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; - - private: - int prev_id_{-1}; -}; - -enum XPUVersion { XPU1, XPU2 }; -XPUVersion get_xpu_version(int dev_id); +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 36be4a55d0a6f1..e9b494024bd699 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -24,7 +24,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == XPU2) { + if (v == pten::backends::xpu::XPUVersion::XPU2) { ops = get_kl2_ops(); } @@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version) { +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; - auto& ops = version == XPU1 ? 
get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end()) { XPUKernelSet& type_set = ops[op_name]; for (auto& item : type_set) { @@ -87,9 +88,10 @@ std::vector get_xpu_op_support_type(const std::string& op_name, return res; } -XPUOpListMap get_xpu_op_list(XPUVersion version) { +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { XPUOpListMap res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); for (auto& op : ops) { std::vector op_vartypes; for (auto& item : op.second) { diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 3672d68492a6f5..4c3eb097a147ee 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,9 +27,9 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version); -XPUOpListMap get_xpu_op_list(XPUVersion version); +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version); +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6ffeaf101feca7..bfb1f572068e0a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -246,52 +246,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() { - context_ = xpu::create_context(); - xpu_version_ = get_xpu_version(place_.device); -} 
+XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::~XPUDeviceContext() {} -XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { - platform::XPUDeviceGuard guard(place.device); - +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - const int MAX_XPU_NUM = 16; - static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - - int l3_size = 13.5 * 1024 * 1024; - if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { - l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); - } - - auto selected_xpus = GetXPUSelectedDevices(); - for (unsigned int i = 0; i < selected_xpus.size(); i++) { - if (place.device == selected_xpus[i]) { - if (l3ptrs[place.device] == nullptr) { - xpu_malloc(static_cast(&l3ptrs[place.device]), l3_size, - XPU_MEM_L3); - } - if (l3ptrs[place.device] != nullptr) { - context_->_l3_mgr.set(l3ptrs[place.device], l3_size); - VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size; - } - break; - } - } + << static_cast(place.device); } - -void XPUDeviceContext::Wait() const { - platform::SetXPUDeviceId(place_.device); - xpu_wait(context_->xpu_stream); -} - -Place XPUDeviceContext::GetPlace() const { return place_; } - -xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 78c09dca5b4886..52f17cd986ce2f 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -78,6 +78,7 @@ struct GpuDevice; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/pten/backends/xpu/xpu_context.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -171,39 +172,12 @@ struct DefaultDeviceContextType; #ifdef 
PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; -class XPUDeviceContext : public DeviceContext { +class XPUDeviceContext : public pten::XPUContext { public: XPUDeviceContext(); explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - XPUVersion xpu_version() const { return xpu_version_; } - Place GetPlace() const override; - xpu::Context* x_context() const; - - /*! \brief Wait for all operations completion in the stream. */ - void Wait() const override; - -#ifdef PADDLE_WITH_XPU_BKCL - /*! \brief Return bkcl context. */ - BKCLContext_t bkcl_context() const { return bkcl_context_; } - - /*! \brief Set bkcl context. */ - void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; } -#endif - - private: - XPUPlace place_; - XPUVersion xpu_version_; - xpu::Context* context_; -#ifdef PADDLE_WITH_XPU_BKCL - BKCLContext_t bkcl_context_; -#endif - - // Need to be the same with other DeviceContext, - // Eventhough eigen_device_ is not used in XPU - std::unique_ptr eigen_device_; - DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); }; template <> diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index a3f0a0c87fd803..780ef741c6aca5 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -697,10 +697,10 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT platform::DeviceContextPool::Instance().Get(src_device)->Wait(); } } - } else if (src->Var().IsType()) { - auto &src_selected_rows = src->Var().Get(); + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); auto *dst_selected_rows = - dst.MutableVar()->GetMutable(); + dst.MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); framework::TensorCopy(src_selected_rows.value(), dst_device, @@ -1392,7 +1392,7 @@ void BindImperative(py::module *m_ptr) { 
PADDLE_ENFORCE_EQ( self.Var().IsType() || - self.Var().IsType(), + self.Var().IsType(), true, platform::errors::InvalidArgument( "Type of Tensor[%s] must be LoDTensor or SelectedRows!", @@ -1423,15 +1423,14 @@ void BindImperative(py::module *m_ptr) { detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); } else { const auto &origin_selected_rows = - self.Var().Get(); + self.Var().Get(); PADDLE_ENFORCE_EQ( origin_selected_rows.value().IsInitialized(), true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); auto *detach_selected_rows = - detach_var->MutableVar() - ->GetMutable(); + detach_var->MutableVar()->GetMutable(); detach_selected_rows->set_height(origin_selected_rows.height()); detach_selected_rows->set_rows(origin_selected_rows.rows()); detach_selected_rows->mutable_value()->ShareDataWith( @@ -1597,7 +1596,7 @@ void BindImperative(py::module *m_ptr) { ? grad_var->MutableVar() ->GetMutable() : grad_var->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (tensor->IsInitialized()) { @@ -1613,7 +1612,7 @@ void BindImperative(py::module *m_ptr) { }) .def("_is_sparse", [](imperative::VarBase &self) { - return self.Var().IsType(); + return self.Var().IsType(); }) .def("_allreduce", [](imperative::VarBase &self, @@ -1623,7 +1622,7 @@ void BindImperative(py::module *m_ptr) { #if NCCL_VERSION_CODE >= 2212 imperative::AllReduce(self.Var(), self.MutableVar(), strategy); #else - if (!self.Var().IsType()) { + if (!self.Var().IsType()) { imperative::AllReduce(self.Var(), self.MutableVar(), strategy); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -2126,11 +2125,10 @@ void BindImperative(py::module *m_ptr) { .Get() .dims()); } else if (self.Var() - .IsType< - framework::SelectedRows>()) { + .IsType()) { return framework::vectorize( self.Var() - .Get() + .Get() .value() .dims()); } else if (self.Var() diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index 
88a43f9428b227..0bd1e94a09cdbe 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -49,35 +49,33 @@ void BindIO(pybind11::module *m) { return tellg; }); - m->def("save_selected_rows", - [](const paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save SelectedRows.", str_file_name)); - - paddle::framework::SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); + m->def("save_selected_rows", [](const pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); - m->def("load_selected_rows", - [](paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); - paddle::framework::DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); + m->def("load_selected_rows", [](pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable("Cannot open %s to load SelectedRows.", + str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m->def("save_lod_tensor_to_memory", [](const 
paddle::framework::LoDTensor &tensor) -> py::bytes { @@ -93,14 +91,14 @@ void BindIO(pybind11::module *m) { }); m->def("save_selected_rows_to_memory", - [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + [](const pten::SelectedRows &selected_rows) -> py::bytes { std::ostringstream ss; paddle::framework::SerializeToStream(ss, selected_rows); return ss.str(); }); m->def("load_selected_rows_from_memory", - [](paddle::framework::SelectedRows &selected_rows, + [](pten::SelectedRows &selected_rows, const std::string &selected_rows_bytes) { std::istringstream fin(selected_rows_bytes, std::ios::in | std::ios::binary); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e9774bd28611df..ad018944e43116 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1217,23 +1217,27 @@ PYBIND11_MODULE(core_noavx, m) { })); #endif - py::class_(m, "SelectedRows") + py::class_(m, "SelectedRows") .def("__init__", - [](SelectedRows &instance) { new (&instance) SelectedRows(); }) + [](pten::SelectedRows &instance) { + new (&instance) pten::SelectedRows(); + }) .def("__init__", - [](SelectedRows &instance, const std::vector rows, + [](pten::SelectedRows &instance, const std::vector rows, const int64_t &height) { - new (&instance) SelectedRows(rows, height); + new (&instance) pten::SelectedRows(rows, height); }) .def("get_tensor", - [](SelectedRows &self) { return self.mutable_value(); }, + [](pten::SelectedRows &self) { return self.mutable_value(); }, py::return_value_policy::reference) .def("numel", - [](SelectedRows &self) -> int64_t { return self.value().numel(); }) - .def("set_height", &SelectedRows::set_height) - .def("height", &SelectedRows::height) + [](pten::SelectedRows &self) -> int64_t { + return self.value().numel(); + }) + .def("set_height", &pten::SelectedRows::set_height) + .def("height", &pten::SelectedRows::height) .def("set_rows", - [](SelectedRows &self, std::vector rows) { + 
[](pten::SelectedRows &self, std::vector rows) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else @@ -1241,8 +1245,9 @@ PYBIND11_MODULE(core_noavx, m) { self.set_rows(new_rows); #endif }) - .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](SelectedRows &self) { + .def("sync_index", + [](pten::SelectedRows &instance) { instance.SyncIndex(); }) + .def("rows", [](pten::SelectedRows &self) { auto rows = self.rows(); std::vector new_rows; new_rows.reserve(rows.size()); @@ -1291,8 +1296,8 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) .def("get_selected_rows", - [](Variable &self) -> SelectedRows * { - return self.GetMutable(); + [](Variable &self) -> pten::SelectedRows * { + return self.GetMutable(); }, py::return_value_policy::reference) .def("get_lod_tensor_array", @@ -1757,27 +1762,30 @@ All parameter, weight, gradient are variables in Paddle. 
.def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", platform::XPUVersion::XPU1) - .value("XPU2", platform::XPUVersion::XPU2) + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", pten::backends::xpu::XPUVersion::XPU1) + .value("XPU2", pten::backends::xpu::XPUVersion::XPU2) .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, platform::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); - m.def("get_xpu_device_op_list", [](platform::XPUVersion version) { + m.def( + "get_xpu_device_op_support_types", + [](const std::string &op_name, pten::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); + m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) { return platform::get_xpu_op_list(version); }); m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); #endif diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index e9f5ec2d05727a..971476a55db935 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ 
b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -261,10 +261,10 @@ std::unique_ptr MakePtenTensorBaseFromVar( } else { return MakePtenDenseTensor(tensor); } - } else if (variable.IsType()) { + } else if (variable.IsType()) { // TODO(chenweihang): now we don't deal with row and height // by xiaowei's advice - const auto& tensor = variable.Get(); + const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; paddle::framework::TensorCopySync( @@ -289,8 +289,8 @@ std::unique_ptr MakePtenTensorBaseFromVar( if (variable->template IsType()) { auto* tensor = variable->template GetMutable(); return MakePtenDenseTensor(*tensor, arg_def); - } else if (variable->template IsType()) { - auto* tensor = variable->template GetMutable(); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); // TODO(chenweihang): adapt SelectedRows by xiaowei's design, // here the row and height will lost in output! 
return MakePtenDenseTensor(tensor->value(), arg_def); @@ -389,8 +389,8 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, tensor->set_type(dtype); } - } else if (variable->IsType()) { - auto* tensor = variable->GetMutable(); + } else if (variable->IsType()) { + auto* tensor = variable->GetMutable(); auto dtype = pten::TransToProtoVarType(src->dtype()); if (!tensor->value().IsInitialized()) { diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index 3587910ff506e5..e9f222d642ea04 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,4 +2,12 @@ add_subdirectory(dynload) add_subdirectory(cpu) -cc_library(pten_context SRCS all_context.cc DEPS device_context) +if(WITH_XPU) + add_subdirectory(xpu) +endif() + +cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) + +if(WITH_XPU) + add_dependencies(pten_context xpu_context) +endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index e749dfb9bd70e3..efce128596b812 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -18,16 +18,11 @@ // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. -#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { struct CPUContext::CPUImpl { - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; - CPUImpl() { device_ = new Eigen::DefaultDevice(); } // Users need to manage external resources. 
@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl { } ~CPUImpl() { - if (res_.device == nullptr) { + if (res_.device == nullptr && device_ != nullptr) { delete device_; device_ = nullptr; } @@ -56,27 +51,28 @@ struct CPUContext::CPUImpl { } Place GetPlace() const { return place_; } + + Eigen::DefaultDevice* device_{nullptr}; + CPUContextResource res_; + CPUPlace place_; }; -CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext() : DeviceContext() { cpu_impl_ = std::make_unique(); } -CPUContext::CPUContext(const CPUContext& other) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(const CPUContext& other) : DeviceContext() { cpu_impl_ = std::make_unique(); cpu_impl_->SetEigenDevice(other.eigen_device()); } -CPUContext::CPUContext(CPUContext&& other) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { cpu_impl_ = std::move(other.cpu_impl_); } CPUContext::~CPUContext() = default; -CPUContext::CPUContext(const CPUContextResource& ctx_res) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { cpu_impl_ = std::make_unique(ctx_res); } diff --git a/paddle/pten/backends/xpu/CMakeLists.txt b/paddle/pten/backends/xpu/CMakeLists.txt new file mode 100644 index 00000000000000..65341dd206fd30 --- /dev/null +++ b/paddle/pten/backends/xpu/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place) +cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info) diff --git a/paddle/pten/backends/xpu/enforce_xpu.h b/paddle/pten/backends/xpu/enforce_xpu.h new file mode 100644 index 00000000000000..38aeff198d44bf --- /dev/null +++ b/paddle/pten/backends/xpu/enforce_xpu.h @@ -0,0 +1,194 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "xpu/bkcl.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace pten { +namespace backends { +namespace xpu { + +// Note: XPU runtime api return int, not XPUError_t +inline const char* xpuGetErrorString(int stat) { + switch (stat) { + case XPU_SUCCESS: + return "Success"; + case XPUERR_INVALID_DEVICE: + return "Invalid XPU device"; + case XPUERR_UNINIT: + return "XPU runtime not properly inited"; + case XPUERR_NOMEM: + return "Device memory not enough"; + case XPUERR_NOCPUMEM: + return "CPU memory not enough"; + case XPUERR_INVALID_PARAM: + return "Invalid parameter"; + case XPUERR_NOXPUFUNC: + return "Cannot get XPU Func"; + case XPUERR_LDSO: + return "Error loading dynamic library"; + case XPUERR_LDSYM: + return "Error loading func from dynamic library"; + case XPUERR_SIMULATOR: + return "Error from XPU Simulator"; + case XPUERR_NOSUPPORT: + return "Operation not supported"; + case XPUERR_ABNORMAL: + return "Device abnormal due to previous error"; + case XPUERR_KEXCEPTION: + return "Exception in kernel execution"; + case XPUERR_TIMEOUT: + return "Kernel execution timed out"; + case XPUERR_BUSY: + return "Resource busy"; + case XPUERR_USEAFCLOSE: + return "Use a stream after closed"; + case XPUERR_UCECC: + return "Uncorrectable ECC"; + case XPUERR_OVERHEAT: + return "Overheat"; + case XPUERR_UNEXPECT: + return "Execution error, reach unexpected control 
flow"; + case XPUERR_DEVRESET: + return "Device is being reset, try again later"; + case XPUERR_HWEXCEPTION: + return "Hardware module exception"; + case XPUERR_HBM_INIT: + return "Error init HBM"; + case XPUERR_DEVINIT: + return "Error init device"; + case XPUERR_PEERRESET: + return "Device is being reset, try again later"; + case XPUERR_MAXDEV: + return "Device count exceed limit"; + case XPUERR_NOIOC: + return "Unknown IOCTL command"; + case XPUERR_DMATIMEOUT: + return "DMA timed out, a reboot maybe needed"; + case XPUERR_DMAABORT: + return "DMA aborted due to error, possibly wrong address or hardware " + "state"; + case XPUERR_MCUUNINIT: + return "Firmware not initialized"; + case XPUERR_OLDFW: + return "Firmware version too old (<15), please update."; + case XPUERR_PCIE: + return "Error in PCIE"; + case XPUERR_FAULT: + return "Error copy between kernel and user space"; + case XPUERR_INTERRUPTED: + return "Execution interrupted by user"; + default: + return "unkonwn error"; + } +} + +inline const char* bkclGetErrorString(BKCLResult_t stat) { + switch (stat) { + case BKCL_SUCCESS: + return "BKCL_SUCCESS"; + case BKCL_INVALID_ARGUMENT: + return "BKCL_INVALID_ARGUMENT"; + case BKCL_RUNTIME_ERROR: + return "BKCL_RUNTIME_ERROR"; + case BKCL_SYSTEM_ERROR: + return "BKCL_SYSTEM_ERROR"; + case BKCL_INTERNAL_ERROR: + return "BKCL_INTERNAL_ERROR"; + default: + return "Unknown BKCL status"; + } +} + +inline const char* xdnnGetErrorString(int stat) { + switch (stat) { + case baidu::xpu::api::Error_t::SUCCESS: + return "XDNN_SUCCESS"; + case baidu::xpu::api::Error_t::INVALID_PARAM: + return "XDNN_INVALID_PARAM"; + case baidu::xpu::api::Error_t::RUNTIME_ERROR: + return "XDNN_RUNTIME_ERROR"; + case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE: + return "XDNN_NO_ENOUGH_WORKSPACE"; + case baidu::xpu::api::Error_t::NOT_IMPLEMENT: + return "XDNN_NOT_IMPLEMENT"; + default: + return "Unknown XDNN status"; + } +} + +inline std::string build_xpu_error_msg(int stat) { + std::string 
msg("XPU Error <" + std::to_string(stat) + ">, "); + return msg + xpuGetErrorString(stat) + " "; +} + +inline std::string build_xpu_error_msg(BKCLResult_t stat) { + std::string msg("BKCL Error, "); + return msg + bkclGetErrorString(stat) + " "; +} + +inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { + return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; +} + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); + +#undef DEFINE_EXTERNAL_API_TYPE + +} // namespace details + +#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __XPU_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::pten::backends::xpu::details::ExternalApiType< \ + __XPU_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::pten::backends::xpu::build_xpu_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/forwards.h b/paddle/pten/backends/xpu/forwards.h new file mode 100644 index 00000000000000..805a74865b6d8c --- /dev/null +++ b/paddle/pten/backends/xpu/forwards.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Forward-declares. +#pragma once + +// Forward declaration of xpu context. +namespace baidu { +namespace xpu { +namespace api { + +struct Context; +typedef void* BKCLContext_t; + +} // namespace api +} // namespace xpu +} // namespace baidu diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc new file mode 100644 index 00000000000000..af4478662a53b8 --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_context.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/backends/xpu/xpu_context.h" +#include +#include "paddle/pten/api/ext/exception.h" + +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" + +namespace xpu = baidu::xpu::api; + +namespace pten { + +struct XPUContext::XPUImpl { + void SetL3Cache() { + const int MAX_XPU_NUM = 16; + static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; + + int l3_size = 13.5 * 1024 * 1024; + if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { + l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); + } + + auto selected_xpus = backends::xpu::GetXPUSelectedDevices(); + for (unsigned int i = 0; i < selected_xpus.size(); i++) { + if (place_.GetDeviceId() == selected_xpus[i]) { + if (l3ptrs[place_.GetDeviceId()] == nullptr) { + xpu_malloc(static_cast(&l3ptrs[place_.GetDeviceId()]), + l3_size, + XPU_MEM_L3); + } + if (l3ptrs[place_.GetDeviceId()] != nullptr) { + context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size); + VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size " + << l3_size; + } + break; + } + } + } + + XPUImpl() { + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + } + + explicit XPUImpl(XPUPlace place) : place_(place) { + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + + LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " + << static_cast(place_.device); + + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); + } + + // Users need to manage external resources. 
+ explicit XPUImpl(const XPUContextResource& ctx_res, + const XPUPlace& place = XPUPlace(0)) + : res_(ctx_res), place_(place) { + context_ = res_.context; + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); + } + + ~XPUImpl() { + if (res_.context == nullptr && context_ != nullptr) { + xpu::destroy_context(context_); + context_ = nullptr; + } + } + + Place GetPlace() const { return place_; } + + backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; } + + xpu::Context* GetXContext() const { + PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); + return context_; + } + + xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } + + void Wait() const { + backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); + PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); + xpu_wait(context_->xpu_stream); + } + + void SetXContext(xpu::Context* context) { + if (context == nullptr) { + return; + } + res_.context = context; + context_ = context; + } + + void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } + + XPUContextResource res_; + XPUPlace place_; + backends::xpu::XPUVersion xpu_version_; + xpu::Context* context_{nullptr}; + // NOTE: Distributed communicator, distributed framework manages its + // resources, XPUContext only holds references. 
+ xpu::BKCLContext_t bkcl_context_{nullptr}; +}; + +XPUContext::XPUContext() : DeviceContext() { + impl_ = std::make_unique(); +} + +XPUContext::XPUContext(const XPUPlace& place) { + impl_ = std::make_unique(place); +} + +XPUContext::XPUContext(const XPUContext& other) : DeviceContext() { + impl_ = std::make_unique(); + impl_->SetXContext(other.x_context()); + impl_->SetBkclContext(other.bkcl_context()); +} + +XPUContext::XPUContext(XPUContext&& other) : DeviceContext() { + impl_ = std::move(other.impl_); +} + +XPUContext::~XPUContext() = default; + +XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { + impl_ = std::make_unique(ctx_res); +} + +Place XPUContext::GetPlace() const { return impl_->GetPlace(); } + +backends::xpu::XPUVersion XPUContext::xpu_version() const { + return impl_->GetXpuVersion(); +} + +xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } + +xpu::BKCLContext_t XPUContext::bkcl_context() const { + return impl_->GetBkclContext(); +} + +void XPUContext::Wait() const { impl_->Wait(); } + +void XPUContext::set_x_context(xpu::Context* context) { + impl_->SetXContext(context); +} + +void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { + impl_->SetBkclContext(context); +} + +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h index 94d2a1532f6365..4ae5786211dd21 100644 --- a/paddle/pten/backends/xpu/xpu_context.h +++ b/paddle/pten/backends/xpu/xpu_context.h @@ -14,13 +14,60 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_XPU +#include +#include "paddle/pten/backends/xpu/forwards.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/device_context.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "paddle/pten/backends/xpu/xpu_info.h" + +namespace xpu = baidu::xpu::api; namespace pten { -using XPUContext = paddle::platform::XPUDeviceContext; -} // namespace pten -#endif // PADDLE_WITH_XPU +struct XPUContextResource { + xpu::Context* context{nullptr}; +}; + +class XPUContext : public DeviceContext { + public: + // NOTE: DeviceContext hold resources. Used in training scenarios. + XPUContext(); + + explicit XPUContext(const XPUPlace&); + + // NOTE: Share the same underlying resources, please ensure that resources are + // not released. + XPUContext(const XPUContext&); + + XPUContext(XPUContext&&); + + virtual ~XPUContext(); + + Place GetPlace() const override; + + backends::xpu::XPUVersion xpu_version() const; + + xpu::Context* x_context() const; + + // Return bkcl context. + xpu::BKCLContext_t bkcl_context() const; + + // Wait for all operations completion in the stream. + void Wait() const override; + + public: + // NOTE: External users manage resources. Used in inference scenarios. + explicit XPUContext(const XPUContextResource&); + + void set_x_context(xpu::Context*); + + void set_bkcl_context(xpu::BKCLContext_t context); + + private: + struct XPUImpl; + std::unique_ptr impl_; +}; + +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_header.h b/paddle/pten/backends/xpu/xpu_header.h new file mode 100644 index 00000000000000..99e4a06720f22b --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_header.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_XPU +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" + +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" + +namespace xpu = baidu::xpu::api; + +static std::map XPUAPIErrorMsg = { + {xpu::Error_t::SUCCESS, "xpu api success"}, + {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, + {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, + {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; + +template +class XPUTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUTypeTrait { + public: + using Type = float16; +}; + +template <> +class XPUTypeTrait { + public: + using Type = bfloat16; +}; + +#endif diff --git a/paddle/pten/backends/xpu/xpu_info.cc b/paddle/pten/backends/xpu/xpu_info.cc new file mode 100644 index 00000000000000..01d23be848bde8 --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_info.cc @@ -0,0 +1,199 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/xpu/xpu_info.h" + +#include +#include +#include + +#include "paddle/pten/backends/xpu/enforce_xpu.h" +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "paddle/pten/common/place.h" + +// TODO(wilber): The pten computing library requires a component to manage +// flags. +#include "paddle/fluid/platform/flags.h" + +PADDLE_DEFINE_EXPORTED_string( + selected_xpus, + "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (XPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "share-memory only."); + +namespace pten { +class XPUContext; + +namespace backends { +namespace xpu { + +/**************************** Version Management **************************/ + +//! Get the version of XPU Driver +int GetDriverVersion() { + uint32_t driver_version_major = 0; + uint32_t driver_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_driver_version(&driver_version_major, &driver_version_minor)); + int driver_version = driver_version_major * 10 + driver_version_minor; + return driver_version; +} + +//! 
Get the version of XPU Runtime +int GetRuntimeVersion() { + uint32_t rumtime_version_major = 0; + uint32_t rumtime_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); + int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; + return runtime_version; +} + +/**************************** Device Management **************************/ + +static int GetDeviceCountImpl() { + const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); + if (xpu_visible_devices != nullptr) { + std::string xpu_visible_devices_str(xpu_visible_devices); + if (std::all_of(xpu_visible_devices_str.begin(), + xpu_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; + return 0; + } + } + + int count = 0; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); + return count; +} + +int GetXPUDeviceCount() { + static auto dev_cnt = GetDeviceCountImpl(); + return dev_cnt; +} + +int GetXPUCurrentDeviceId() { + int dev_id; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + return dev_id; +} + +void SetXPUDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetXPUDeviceCount(), + paddle::platform::errors::InvalidArgument("id must less than XPU count")); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); +} + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices() { + // use user specified XPUs in single-node multi-process mode. 
+ std::vector devices; + if (!FLAGS_selected_xpus.empty()) { + auto devices_str = Split(FLAGS_selected_xpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +/**************************** Memory Management **************************/ + +void MemcpySyncH2D(void* dst, + const void* src, + size_t count, + const pten::XPUPlace& dst_place) { + XPUDeviceGuard guard(dst_place.device); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void MemcpySyncD2H(void* dst, + const void* src, + size_t count, + const pten::XPUPlace& src_place, + const pten::XPUContext& dev_ctx) { + XPUDeviceGuard guard(src_place.GetDeviceId()); + dev_ctx.Wait(); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); +} + +// if src.device == dst.device and you need sync , after call this function, +// need to call xpu_wait() +void MemcpySyncD2D(void* dst, + const pten::XPUPlace& dst_place, + const void* src, + const pten::XPUPlace& src_place, + size_t count, + const pten::XPUContext& dev_ctx) { + int dev_id = GetXPUCurrentDeviceId(); + if (dst_place.device == dev_id && src_place.device == dev_id) { + PADDLE_ENFORCE_XDNN_SUCCESS( + baidu::xpu::api::copy(dev_ctx.x_context(), + static_cast(src), + static_cast(dst), + count), + "copy "); + } else { + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); + } +} + +/**************************** Others **************************/ + +XPUVersion get_xpu_version(int dev_id) { + uint64_t v = 0; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); + + if (v == K100 || v == K200) { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; + return XPU1; + } else { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; + return 
XPU2; + } +} + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_info.h b/paddle/pten/backends/xpu/xpu_info.h new file mode 100644 index 00000000000000..8cf836ba16dc6a --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_info.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/pten/common/place.h" + +namespace pten { + +class XPUContext; + +namespace backends { +namespace xpu { + +/***** Version Management *****/ + +//! Get the version of XPU Driver +int GetDriverVersion(); + +//! Get the version of XPU Runtime +int GetRuntimeVersion(); + +/***** Device Management *****/ + +//! Get the total number of XPU devices in system. +int GetXPUDeviceCount(); + +//! Set the XPU device id for next execution. +void SetXPUDeviceId(int device_id); + +//! Get the current XPU device id in system. +int GetXPUCurrentDeviceId(); + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices(); + +/***** Memory Management *****/ + +//! Copy memory from address src to dst synchronously. 
+void MemcpySyncH2D(void *dst, + const void *src, + size_t count, + const pten::XPUPlace &dst_place); +void MemcpySyncD2H(void *dst, + const void *src, + size_t count, + const pten::XPUPlace &src_place, + const pten::XPUContext &dev_ctx); +void MemcpySyncD2D(void *dst, + const pten::XPUPlace &dst_place, + const void *src, + const pten::XPUPlace &src_place, + size_t count, + const pten::XPUContext &dev_ctx); + +class XPUDeviceGuard { + public: + explicit inline XPUDeviceGuard(int dev_id) { + int prev_id = GetXPUCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + SetXPUDeviceId(dev_id); + } + } + + inline ~XPUDeviceGuard() { + if (prev_id_ != -1) { + SetXPUDeviceId(prev_id_); + } + } + + XPUDeviceGuard(const XPUDeviceGuard &o) = delete; + XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +enum XPUVersion { XPU1, XPU2 }; +XPUVersion get_xpu_version(int dev_id); + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index e8678a5189450e..e89d2cd3b3c387 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -22,7 +22,7 @@ cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector) cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base) cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base ) -cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base dense_tensor) +cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector enforce ddim) diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index b1a5015f010c20..7fb532e00feaa6 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -231,9 
+231,14 @@ void DenseTensor::ResetHolder(const std::shared_ptr& holder) { "Only the offset is supported to zero when the holder is reset.")); if (holder_) { + // TODO(zyfncg): The static_cast<> added to this check is temporary and will + // be reverted when SetAllocationForOutputTenosr is deleted. + // For now numel() may return -1, which is cast to a very large number when + // it is compared with an unsigned long value and makes the check fail, + // so the casts below are a temporary workaround for that problem. PADDLE_ENFORCE_LE( - numel() * SizeOf(dtype()) + meta_.offset, - holder->size(), + numel() * static_cast(SizeOf(dtype())), + static_cast(holder->size()), paddle::platform::errors::InvalidArgument( "The size of Holder is not enough to store the Tensor.")); } diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7b2c4a2cf170f1..7566b351bf6340 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,28 +13,45 @@ // limitations under the License. #include "paddle/pten/core/device_context.h" +#include "paddle/pten/api/ext/exception.h" namespace pten { struct DeviceContext::Impl { - Allocator* allocator_{nullptr}; - Impl() = default; ~Impl() = default; - void SetAllocator(Allocator* allocator) { allocator_ = allocator; } + void SetDeviceAllocator(Allocator* allocator) { + device_allocator_ = allocator; + } + + void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + + const Allocator& GetDeviceAllocator() const { + PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + return *device_allocator_; + } - const Allocator& GetAllocator() const { return *allocator_; } + const Allocator& GetHostAllocator() const { + PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + return *host_allocator_; + } // TODO(Wilber): Add impl. It seems that tensorbase not have interface to // communicate with allocator. 
- void Alloc(TensorBase* tensor) {} + void HostAlloc(TensorBase* tensor) {} + void DeviceAlloc(TensorBase* tensor) {} + + Allocator* device_allocator_{nullptr}; + Allocator* host_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetAllocator(const_cast(&other.GetAllocator())); + impl_->SetDeviceAllocator( + const_cast(&other.GetDeviceAllocator())); + impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetAllocator(Allocator* allocator) { - impl_->SetAllocator(allocator); +void DeviceContext::SetHostAllocator(Allocator* allocator) { + impl_->SetHostAllocator(allocator); +} + +void DeviceContext::SetDeviceAllocator(Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); +} + +const Allocator& DeviceContext::GetHostAllocator() const { + return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetAllocator() const { - return impl_->GetAllocator(); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); } -void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); } +void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } + +void DeviceContext::DeviceAlloc(TensorBase* tensor) { + impl_->DeviceAlloc(tensor); +} } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 1ee2e21494bf54..c658a24c3527d5 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -57,19 +57,38 @@ class DeviceContext { * * @param allocator */ - void SetAllocator(Allocator*); + void SetDeviceAllocator(Allocator*); /** - * @brief Get the const Allocator object. 
+ * @brief Get the const device-related Allocator object. * * @return Allocator */ - const Allocator& GetAllocator() const; + const Allocator& GetDeviceAllocator() const; /** - * @brief Allocate memory for tensor. + * @brief Allocate device memory for tensor. */ - void Alloc(pten::TensorBase*); + void DeviceAlloc(pten::TensorBase*); + + /** + * @brief Set the host Allocator object. + * + * @param allocator + */ + void SetHostAllocator(Allocator*); + + /** + * @brief Get the const host Allocator object. + * + * @return Allocator + */ + const Allocator& GetHostAllocator() const; + + /** + * @brief Allocate host memory for tensor. + */ + void HostAlloc(pten::TensorBase*); // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. diff --git a/paddle/pten/tests/core/test_selected_rows.cc b/paddle/pten/tests/core/test_selected_rows.cc index 81c7ff4a838a70..c6e52ff64eab90 100644 --- a/paddle/pten/tests/core/test_selected_rows.cc +++ b/paddle/pten/tests/core/test_selected_rows.cc @@ -40,7 +40,7 @@ class SelectedRowsTester : public ::testing::Test { protected: pten::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 6e6d2a672fd186..da0f2ebcba89ef 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -279,7 +279,7 @@ def _is_overlapped(shape_x, shape_y): return overlapped -def _need_reshard(dist_tensor, dist_op): +def _need_reshard(dist_tensor, dist_op, op_input=True): """Judge the tensor whether needs to be resharded.""" is_reshard = False tensor_dist_attr = dist_tensor.dist_attr @@ -289,13 +289,31 @@ def _need_reshard(dist_tensor, dist_op): op_dist_attr = dist_op.dist_attr op_input_dims_mapping = 
op_dist_attr.get_input_dims_mapping(tensor_name) op_process_mesh = op_dist_attr.process_mesh - if all( - map(lambda x: x is not None, [ - tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, - op_process_mesh - ])): - if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: - is_reshard = True + if op_input: + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_input_dims_mapping, op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: + is_reshard = True + else: + op_output_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_output_dims_mapping, op_process_mesh + ])): + if tensor_process_mesh != op_process_mesh: + is_reshard = True + if tensor_dims_mapping != op_output_dims_mapping: + raise ValueError( + "It is not supported that tensor dims mapping is different from op output dims mapping." + ) return is_reshard @@ -948,12 +966,13 @@ def remove_no_need_in_startup(auto_parallel_main_prog, def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, dist_context): """ - Reshard tensor in the program according to its dist attr and corresponding op dist attr. + Reshard tensor in the program according to its distributed attribute and corresponding op distributed attribute. Args: auto_parallel_main_prog (Program): An auto parallel main program. auto_parallel_startup_prog (Program): An auto parallel startup program. rank_id (int): The process id. + dist_context (DistributedContext): The distributed context of this rank. 
""" assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ "but got {}.".format(type(auto_parallel_main_prog)) @@ -1001,6 +1020,34 @@ def _is_special_op(op): else: idx += 1 + # insert send and recv op if output process mesh is different from tensor process mesh + idx = 0 + skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"] + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op is not None and op.type not in skip_ops: + for var_name in op.output_arg_names: + var = block.vars[var_name] + dist_tensor = dist_context.get_dist_tensor_for_program(var) + if dist_tensor is not None and _need_reshard(dist_tensor, + dist_op, False): + for index, item in enumerate( + dist_op.dist_attr.process_mesh.processes): + recv_rank = dist_tensor.dist_attr.process_mesh.processes[ + index] + if rank_id == item: + _insert_send_op(block, idx + 1, var, recv_rank) + if rank_id == recv_rank: + _insert_recv_op(block, idx + 1, var, item) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + # remove no need vars and ops in the main program remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index b234e25823f4b3..a93abd3c127768 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -27,7 +27,7 @@ from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import reshard +from 
paddle.distributed.auto_parallel.reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER from paddle.distributed.auto_parallel.process_group import _g_process_group_map from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -143,7 +143,11 @@ def mlp_forward(train_program, start_program): return loss, train_program, start_program -def get_dist_prog(train_program, startup_program, dist_context, rank_id): +def get_dist_prog(train_program, + startup_program, + dist_context, + rank_id, + change_process_mesh=False): loss, train_program, startup_program = mlp_forward(train_program, startup_program) @@ -157,6 +161,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = completer.complete_forward_annotation( train_program) + if change_process_mesh: + global PP_MESH_1 + dist_context.get_tensor_dist_attr_for_program( + train_program.global_block().vars[ + "gelu_0.tmp_0"]).process_mesh = PP_MESH_1 + params_grads = parallelizer._generate_backward( complete_train_program, startup_program, @@ -308,6 +318,25 @@ def test_mlp_pp(self): # parameter initialization of every rank should be different in the pipeline scene self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + def test_mlp_pp_diff_process_mesh(self): + HAS_SENT.clear() + HAS_RECV.clear() + HAS_ALLGATHER.clear() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id, True) + for key in list(_g_process_group_map.keys()): + del _g_process_group_map[key] + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print_program_with_dist_attr(dist_main_prog, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + 
def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" diff --git a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py index 79de17fdb66412..b67dbd0ba622d9 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py @@ -41,9 +41,6 @@ def test_branches(self): paddle.fluid.framework._dygraph_tracer().trace_op( 'instance_norm', {'Scale': [scale], 'X': [x]}, {'Y': [x]}, {}) - paddle.fluid.framework._dygraph_tracer().trace_op( - 'coalesce_tensor', {'Input': [x]}, {'Output': [x]}, - {'dtype': int(core.VarDesc.VarType.FP32)}) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index 352089e1fb75fa..b20305b78efe2d 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -19,11 +19,12 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.fluid.core as core -def p_norm(x, axis, porder, keepdims=False): +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): r = [] - if axis is None: + if axis is None or reduce_all: x = x.flatten() if porder == np.inf: r = np.amax(np.abs(x), keepdims=keepdims) @@ -53,8 +54,8 @@ def p_norm(x, axis, porder, keepdims=False): else: if isinstance(axis, list): axis = tuple(axis) - r = np.linalg.norm( - x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) return r @@ -111,13 +112,14 @@ def setUp(self): self.op_type = "p_norm" self.init_test_case() x = (np.random.random(self.shape) + 0.5).astype(self.dtype) - norm = p_norm(x, self.axis, self.porder, self.keepdim) + norm = p_norm(x, self.axis, self.porder, self.keepdim, self.asvector) self.inputs = {'X': x} self.attrs = { 
'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } self.outputs = {'Out': norm} self.gradient = self.calc_gradient() @@ -135,34 +137,42 @@ def init_test_case(self): self.porder = 2.0 self.keepdim = False self.dtype = "float64" + self.asvector = False def calc_gradient(self): self.attrs = { 'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } x = self.inputs["X"] porder = self.attrs["porder"] axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x if porder == 0: grad = np.zeros(x.shape).astype(x.dtype) elif porder in [float("inf"), float("-inf")]: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) x_abs = np.abs(x) grad = np.sign(x) grad[x_abs != norm] = 0.0 else: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) grad = np.power(norm, 1 - porder) * np.power( np.abs(x), porder - 1) * np.sign(x) numel = 1 for s in x.shape: numel *= s - numel /= x.shape[axis] - return [grad.astype(x.dtype) * 1 / numel] + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / numel] class TestPnormOp2(TestPnormOp): @@ -173,6 +183,7 @@ def init_test_case(self): self.porder = 2.0 self.keepdim = True self.dtype = "float32" + self.asvector = False def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -186,6 +197,7 @@ def init_test_case(self): self.porder = np.inf self.keepdim = True self.dtype = "float32" + self.asvector = False def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) @@ -199,6 +211,7 @@ def 
init_test_case(self): self.porder = -np.inf self.keepdim = True self.dtype = "float32" + self.asvector = False def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) @@ -212,11 +225,63 @@ def init_test_case(self): self.porder = 0 self.keepdim = True self.dtype = "float32" + self.asvector = False def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) +class TestPnormOp6(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = -1 + self.epsilon = 1e-12 + self.porder = 2 + self.keepdim = False + self.dtype = "float32" + self.asvector = True + + def test_check_grad(self): + self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormOpFP16(TestPnormOp): + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = "float16" + self.asvector = False + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormOpFP161(TestPnormOpFP16): + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = "float16" + self.asvector = True + + def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype)