Skip to content

Commit

Permalink
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
Browse files Browse the repository at this point in the history
… fix_accumulation_node
  • Loading branch information
jim19930609 committed Feb 21, 2022
2 parents 9f04b85 + a863b32 commit d342316
Show file tree
Hide file tree
Showing 72 changed files with 1,940 additions and 882 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -176,4 +176,4 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
USE_OP_ITSELF(scale);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP_ITSELF(reduce_sum);
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {

USE_OP_ITSELF(scale);
USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP_ITSELF(reduce_sum);
USE_OP(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,4 @@ TEST(Benchmark, FluidMLPCPU) {
USE_OP_ITSELF(scale);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP_ITSELF(reduce_sum);
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ TEST(Benchmark, FluidMLPCUDA) {

USE_OP_ITSELF(scale);
USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP_ITSELF(reduce_sum);
USE_OP(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);

Expand Down
8 changes: 5 additions & 3 deletions paddle/fluid/framework/details/scope_buffered_monitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

namespace paddle {
namespace framework {
Expand Down Expand Up @@ -91,7 +91,8 @@ void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,
bool has_fetch) {
std::unique_ptr<platform::RecordEvent> pre_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::pre_local_exec_scopes_process"));
"ScopeBufferedMonitor::pre_local_exec_scopes_process",
platform::TracerEventType::UserDefined, 2));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
pre_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
Expand All @@ -105,7 +106,8 @@ void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,

std::unique_ptr<platform::RecordEvent> post_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::post_local_exec_scopes_process"));
"ScopeBufferedMonitor::post_local_exec_scopes_process",
platform::TracerEventType::UserDefined, 2));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
post_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

namespace paddle {
namespace framework {
Expand Down Expand Up @@ -75,7 +75,8 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run(
#endif

if (drop_scope_counter_ == 0) {
platform::RecordEvent e("InitLocalVars");
platform::RecordEvent e("InitLocalVars",
platform::TracerEventType::UserDefined, 2);
InitVariables();
}

Expand Down Expand Up @@ -164,7 +165,8 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
}

void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) {
platform::RecordEvent drop_scope_event("DropLocalExeScopes");
platform::RecordEvent drop_scope_event(
"DropLocalExeScopes", platform::TracerEventType::UserDefined, 2);
drop_scope_counter_ = 0;
if (need_wait) {
for (auto &p : places_) {
Expand Down
7 changes: 5 additions & 2 deletions paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

DECLARE_bool(sync_nccl_allreduce);

Expand Down Expand Up @@ -66,7 +66,8 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
}

void SparseAllReduceOpHandle::RunImplEncoded() {
platform::RecordEvent record_event(Name());
platform::RecordEvent record_event(Name(),
platform::TracerEventType::UserDefined, 2);

auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
Expand Down Expand Up @@ -279,6 +280,8 @@ bool SparseAllReduceOpHandle::IsEncoded() {
}

void SparseAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(
Name(), platform::TracerEventType::Communication, 1);
if (!IsEncoded()) {
AllReduceOpHandle::RunImpl();
return;
Expand Down
5 changes: 3 additions & 2 deletions paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"

#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
Expand Down Expand Up @@ -56,7 +56,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare",
platform::TracerEventType::UserDefined, 2));
std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
CopyOpDeps();

Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ IF(WITH_GPU)
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm)
nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
ENDIF()
IF(WITH_ROCM)
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
Expand Down
144 changes: 144 additions & 0 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "heter_comm.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace framework {
struct GpuPsGraphNode {
int64_t node_id;
int neighbor_size, neighbor_offset;
// this node's neighbor is stored on [neighbor_offset,neighbor_offset +
// neighbor_size) of int64_t *neighbor_list;
};

struct GpuPsCommGraph {
int64_t *neighbor_list;
GpuPsGraphNode *node_list;
int neighbor_size, node_size;
// the size of neighbor array and graph_node_list array
GpuPsCommGraph()
: neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_,
int neighbor_size_, int node_size_)
: neighbor_list(neighbor_list_),
node_list(node_list_),
neighbor_size(neighbor_size_),
node_size(node_size_) {}
};

/*
suppose we have a graph like this
0----3-----5----7
\ |\ |\
17 8 9 1 2
we save the nodes in arbitrary order,
in this example,the order is
[0,5,1,2,7,3,8,9,17]
let us name this array u_id;
we record each node's neighbors:
0:3,17
5:3,7
1:7
2:7
7:1,2,5
3:0,5,8,9
8:3
9:3
17:0
by concatenating each node's neighbor_list in the order we save the node id.
we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
this is the neighbor_list of GpuPsCommGraph
given this neighbor_list and the order to save node id,
we know,
node 0's neighbors are in the range [0,1] of neighbor_list
node 5's neighbors are in the range [2,3] of neighbor_list
node 1's neighbors are in the range [4,4] of neighbor_list
node 2:[5,5]
node 7:[6,6]
node 3:[9,12]
node 8:[13,13]
node 9:[14,14]
node 17:[15,15]
...
by the above information,
we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
of size 9,
where node_list[i].id = u_id[i]
then we have:
node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
*/
struct NeighborSampleResult {
int64_t *val;
int *actual_sample_size, sample_size, key_size;
NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL;
val = NULL;
};
~NeighborSampleResult() {
if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size);
}
};

struct NodeQueryResult {
int64_t *val;
int actual_sample_size;
NodeQueryResult() {
val = NULL;
actual_sample_size = 0;
};
~NodeQueryResult() {
if (val != NULL) cudaFree(val);
}
};
class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
public:
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource)
: HeterComm<int64_t, int, int>(1, resource) {
load_factor_ = 0.25;
}
void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list);
NodeQueryResult *graph_node_sample(int gpu_id, int sample_size);
NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key,
int sample_size, int len);
NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
void clear_graph_info();
void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num,
int sample_size, int *h_left,
int *h_right,
int64_t *src_sample_res,
int *actual_sample_size);

private:
std::vector<GpuPsCommGraph> gpu_graph_list;
};
}
};
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h"
#endif
Loading

0 comments on commit d342316

Please sign in to comment.