Skip to content

Commit

Permalink
Fix Tree Reduction on new instance type p3dn.24xlarge (apache#13852)
Browse files Browse the repository at this point in the history
* add fallback for gpu topology detection using CUDA 9.2

* add fallback for gpu topology detection using CUDA 9.2

* add log

* update 3rdparty to master

* add fallback for gpu topology detection using CUDA 9.2

* add log

* update 3rdparty to master

* bring 3rdparty packages to upstream/master

* rebase to master

* Update gpu_topology.h
  • Loading branch information
ctcyang authored and eric-haibin-lin committed Jan 12, 2019
1 parent c4761d0 commit 0d76675
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 14 deletions.
16 changes: 8 additions & 8 deletions src/kvstore/comm_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
// BroadcastRowSparse
InitMergeBuffer(devs_);
InitMergeBufferTree();
if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
EnableP2P();
}
}
}

Expand Down Expand Up @@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
}

private:
void EnableP2P() {
void EnableP2P(std::vector<int>* p2p) {
#if MXNET_USE_CUDA
std::vector<int> gpus;
for (const auto& d : devs_) {
Expand All @@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
}
int n = static_cast<int>(gpus.size());
int enabled = 0;
std::vector<int> p2p(n*n);
p2p->clear();
p2p->resize(n*n, 0);
for (int i = 0; i < n; ++i) {
mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
Expand All @@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
++enabled;
p2p[i*n+j] = 1;
(*p2p)[i*n+j] = 1;
}
}
}
Expand All @@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
std::string access(n, '.');
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
access[j] = p2p[i*n+j] ? 'v' : '.';
access[j] = (*p2p)[i*n+j] ? 'v' : '.';
}
LOG(WARNING) << access;
}
Expand All @@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
void QueryTopology() {
#if MXNET_USE_CUDA
std::vector<float> link_matrix(devs_.size()*devs_.size());
GetP2PWeight(devs_, &link_matrix);
std::vector<int> p2p_matrix(devs_.size()*devs_.size());
EnableP2P(&p2p_matrix);
GetP2PWeight(devs_, p2p_matrix, &link_matrix);
if (backtrack_)
LOG(INFO) << "Using Backtracking to generate trees";
else
Expand Down
69 changes: 63 additions & 6 deletions src/kvstore/gpu_topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
/**
* \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
* \param devs is a vector of GPU contexts
* \param p2p_matrix is adjacency matrix of P2P connections where
* 0: no P2P connection
* 1: P2P connection
* \param matrix is adjacency matrix of link topology graph
* where edge weight represents relative performance of NVIDIA GPUs
* 0: Self-connection
Expand All @@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
* 3: 2 NVLink connections
*/
template <typename T>
inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
inline void GetP2PWeight(const std::vector<Context>& devs,
const std::vector<int>& p2p_matrix,
std::vector<T>* matrix) {
int num_gpus = devs.size();
int count = 0;
std::vector<int> zero_dev_id(num_gpus, -1);
Expand Down Expand Up @@ -161,11 +166,61 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
}
}

// Check that all GPUs have at least 1 NVLink connection
int max_value = 0;
for (unsigned int i = 0; i < max.size(); ++i) {
if (max[i] > max_value)
max_value = max[i];
// Check that all P2P connections are detected by GetP2PAttribute
// If yes, then continue as before
// If not, then treat fallback to using p2p_matrix (from EnableP2P)
//
// We have observed that with CUDA 9.0 p3.16xlarge:
//
// 0 2 2 3 3 1 1 1 . v v v v . . .
// 2 0 3 2 1 3 1 1 v . v v . v . .
// 2 3 0 3 1 1 2 1 v v . v . . v .
// 3 2 3 0 1 1 1 2 v v v . . . . v
// 3 1 1 1 0 2 2 3 v . . . . v v v
// 1 3 1 1 2 0 3 2 . v . . v . v v
// 1 1 2 1 2 3 0 3 . . v . v v . v
// 1 1 1 2 3 2 3 0 . . . v v v v .
//
// matrix p2p_matrix
//
// Here, they are correctly detected, because the 2s and 3s correspond to
// links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
//
// 0 2 2 1 1 1 1 1 . v v v v . . .
// 2 0 1 2 1 1 1 1 v . v v . v . .
// 2 1 0 1 1 1 2 1 v v . v . . v .
// 1 2 1 0 1 1 1 2 v v v . . . . v
// 1 1 1 1 0 2 2 1 v . . . . v v v
// 1 1 1 1 2 0 1 2 . v . . v . v v
// 1 1 2 1 2 1 0 1 . . v . v v . v
// 1 1 1 2 1 2 1 0 . . . v v v v .
//
// matrix p2p_matrix
//
// The fastest connections (3 - double NVLink) are not recognized as being any
if (kLogTree) {
PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
}

// different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
bool matrix_correct = true;
for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
matrix_correct = false;
break;
}
}

if (!matrix_correct) {
LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
<< "Falling back to cudaDeviceEnablePeerAccess for topology detection";
for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
if (p2p_matrix[i] > 0)
(*matrix)[i] = 2;
else
(*matrix)[i] = 1;
}
}

// If all GPUs are connected by NVLink, then we can use NVLink only
Expand All @@ -188,6 +243,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
}
}
if (kLogTree)
PrintMatrix("Weight", *matrix, num_gpus, num_gpus);

#else
LOG(WARNING) << "GPU required for link topology";
Expand Down

0 comments on commit 0d76675

Please sign in to comment.