From 0d766750695ef7c0f3152a4f7556eb951ad24688 Mon Sep 17 00:00:00 2001
From: Carl Yang
Date: Fri, 11 Jan 2019 22:15:58 -0800
Subject: [PATCH] Fix Tree Reduction on new instance type p3dn.24xlarge (#13852)

* add fallback for gpu topology detection using CUDA 9.2
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* bring 3rdparty packages to upstream/master
* rebase to master
* Update gpu_topology.h
---
 src/kvstore/comm_tree.h    | 16 ++++-----
 src/kvstore/gpu_topology.h | 69 ++++++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228cd2885..11d99c021917 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       // BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a5d263..777fb47f9945 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *        0: no P2P connection
+ *        1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *        0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *        3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,61 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However, for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  // different from (1 - non-P2P PCI-E). This is why we fall back to p2p_matrix.
+  if (kLogTree) {
+    PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+    PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+  }
+
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
" + << "Falling back to cudaDeviceEnablePeerAccess for topology detection"; + for (unsigned i = 0; i < p2p_matrix.size(); ++i) { + if (p2p_matrix[i] > 0) + (*matrix)[i] = 2; + else + (*matrix)[i] = 1; + } } // If all GPUs are connected by NVLink, then we can use NVLink only @@ -188,6 +243,8 @@ inline void GetP2PWeight(const std::vector& devs, std::vector* matri matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value; } } + if (kLogTree) + PrintMatrix("Weight", *matrix, num_gpus, num_gpus); #else LOG(WARNING) << "GPU required for link topology";