From 0d766750695ef7c0f3152a4f7556eb951ad24688 Mon Sep 17 00:00:00 2001
From: Carl Yang
Date: Fri, 11 Jan 2019 22:15:58 -0800
Subject: [PATCH] Fix Tree Reduction on new instance type p3dn.24xlarge (#13852)

* add fallback for gpu topology detection using CUDA 9.2
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* bring 3rdparty packages to upstream/master
* rebase to master
* Update gpu_topology.h
---
 src/kvstore/comm_tree.h    | 16 ++++-----
 src/kvstore/gpu_topology.h | 69 ++++++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228cd2885..11d99c021917 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       // BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a5d263..777fb47f9945 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *        0: no P2P connection
+ *        1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *        0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *        3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,61 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However, for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  // different from (1 - non-P2P PCI-E). This is why we fall back to p2p_matrix.
+  if (kLogTree) {
+    PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+    PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+  }
+
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
" + << "Falling back to cudaDeviceEnablePeerAccess for topology detection"; + for (unsigned i = 0; i < p2p_matrix.size(); ++i) { + if (p2p_matrix[i] > 0) + (*matrix)[i] = 2; + else + (*matrix)[i] = 1; + } } // If all GPUs are connected by NVLink, then we can use NVLink only @@ -188,6 +243,8 @@ inline void GetP2PWeight(const std::vector& devs, std::vector* matri matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value; } } + if (kLogTree) + PrintMatrix("Weight", *matrix, num_gpus, num_gpus); #else LOG(WARNING) << "GPU required for link topology";