From 3b0c3e96a51023ed1e017b7c7fdb3d6d034c8842 Mon Sep 17 00:00:00 2001 From: Peter Heywood Date: Fri, 6 May 2022 17:56:06 +0100 Subject: [PATCH] Set NVRTC gpu-architecture flag to maximum supported version for nvrtc & device Closes #844 The maximum compute capability supported by the currently linked NVRTC that is less than or equal to the device's architecture is used for RTC compilation. This fixes an issue where running an RTC model on consumer Ampere (SM_86) would fail on CUDA 11.0 and older, which are not aware of SM_86's existence. CUDA 11.2+ includes methods to query which architectures are supported by the dynamically linked NVRTC (which may add or remove architectures in new releases, and due to a stable ABI from 11.2 for all 11.x releases the linked version can be different than the version available at compile time). CUDA 11.1 and below (11.1, 11.0 and 10.x currently in our case) do not include these methods, and due to the absence of a stable nvrtc ABI for these versions the known values can be hardcoded at compile time (grim but simple). A method to select the most appropriate value from an ascending-order vector has also been introduced, so this gencode functionality can be programmatically tested without having to predict what values would be appropriate based on the current device and the CUDA version used, which is a moving target. 
--- .../util/detail/compute_capability.cuh | 21 +++++++ src/flamegpu/util/detail/JitifyCache.cu | 22 +++++-- .../util/detail/compute_capability.cu | 55 ++++++++++++++++++ .../util/test_compute_capability.cu | 58 +++++++++++++++++++ 4 files changed, 150 insertions(+), 6 deletions(-) diff --git a/include/flamegpu/util/detail/compute_capability.cuh b/include/flamegpu/util/detail/compute_capability.cuh index 709d333c6..9f22d395c 100644 --- a/include/flamegpu/util/detail/compute_capability.cuh +++ b/include/flamegpu/util/detail/compute_capability.cuh @@ -1,6 +1,8 @@ #ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_ #define INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_ +#include + #include "flamegpu/gpu/detail/CUDAErrorChecking.cuh" namespace flamegpu { @@ -30,6 +32,25 @@ int minimumCompiledComputeCapability(); */ bool checkComputeCapability(int deviceIndex); +/** + * Get the compute capabilities supported by the linked NVRTC, irrespective of whether FLAMEGPU was configured for that architecture. + * CUDA 11.2 or greater provides methods to make this dynamic. Older versions of CUDA are hardcoded (11.1, 11.0 and 10.x only). + * @return vector of compute capability integers ((major * 10) + minor) in ascending order + */ +std::vector getNVRTCSupportedComputeCapabilties(); + + +/** + * Get the best matching compute capability from a vector of compute capabilities in ascending order + * I.e. get the maximum CC value which is less than or equal to the target CC + * + * This method has been separated from JitifyCache::compileKernel so that it can be tested generically, without having to write tests which are relative to the linked nvrtc and/or the current device. + * + * @param target compute capability to find the best match for + * @param architectures a vector of architectures in ascending order + * @return the best compute capability to use (the largest value LE target), or 0 if none are appropriate. 
+ */ +int selectAppropraiteComputeCapability(const int target, const std::vector& architectures); } // namespace compute_capability } // namespace detail } // namespace util diff --git a/src/flamegpu/util/detail/JitifyCache.cu b/src/flamegpu/util/detail/JitifyCache.cu index dd3fc4bf0..d1826bef4 100644 --- a/src/flamegpu/util/detail/JitifyCache.cu +++ b/src/flamegpu/util/detail/JitifyCache.cu @@ -1,5 +1,7 @@ #include "flamegpu/util/detail/JitifyCache.h" +#include + #include #include #include @@ -315,12 +317,20 @@ std::unique_ptr JitifyCache::compileKernel(const std::strin } #endif - // Set the compilation architecture target if it was successfully detected. - int currentDeviceIdx = 0; - cudaError_t status = cudaGetDevice(&currentDeviceIdx); - if (status == cudaSuccess) { - int arch = compute_capability::getComputeCapability(currentDeviceIdx); - options.push_back(std::string("--gpu-architecture=compute_" + std::to_string(arch))); + // Set the cuda compute capability architecture to optimize / generate for, based on the values supported by the current dynamically linked nvrtc and the device in question. + std::vector nvrtcArchitectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties(); + if (nvrtcArchitectures.size()) { + int currentDeviceIdx = 0; + if (cudaSuccess == cudaGetDevice(&currentDeviceIdx)) { + int arch = compute_capability::getComputeCapability(currentDeviceIdx); + int maxSupportedArch = compute_capability::selectAppropraiteComputeCapability(arch, nvrtcArchitectures); + // only set an nvrtc compilation flag if a usable value was found + if (maxSupportedArch != 0) { + options.push_back(std::string("--gpu-architecture=compute_" + std::to_string(maxSupportedArch))); + } + // Optionally we could error here, but this *should* never occur + // else { } + } } // If CUDA is compiled with -G (--device-debug) forward it to the compiler, otherwise forward lineinfo for profiling. 
diff --git a/src/flamegpu/util/detail/compute_capability.cu b/src/flamegpu/util/detail/compute_capability.cu index ab622040f..4d4fd6fff 100644 --- a/src/flamegpu/util/detail/compute_capability.cu +++ b/src/flamegpu/util/detail/compute_capability.cu @@ -1,6 +1,11 @@ +#include + +#include + #include "flamegpu/util/detail/compute_capability.cuh" #include "flamegpu/gpu/detail/CUDAErrorChecking.cuh" + namespace flamegpu { namespace util { namespace detail { @@ -47,6 +52,56 @@ bool compute_capability::checkComputeCapability(int deviceIndex) { } } +std::vector compute_capability::getNVRTCSupportedComputeCapabilties() { +// NVRTC included with CUDA 11.2+ includes methods to query the supported architectures and CUDA from 11.2+ +// Also changes the soname rules such that nvrtc.11.2.so is valid for all nvrtc >= 11.2, and libnvrtc.12.so for CUDA 12.x etc, so this is different at runtime not compile time for future versions, so use the methods +#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ >= 2) + nvrtcResult nvrtcStatus = NVRTC_SUCCESS; + int nvrtcNumSupportedArchs = 0; + // Query the number of architecture flags supported by this nvrtc, to allocate enough memory + nvrtcStatus = nvrtcGetNumSupportedArchs(&nvrtcNumSupportedArchs); + if (nvrtcStatus == NVRTC_SUCCESS && nvrtcNumSupportedArchs > 0) { + // prepare a large enough std::vector for the results + std::vector nvrtcSupportedArchs = std::vector(nvrtcNumSupportedArchs); + assert(nvrtcSupportedArchs.size() >= nvrtcNumSupportedArchs); + nvrtcStatus = nvrtcGetSupportedArchs(nvrtcSupportedArchs.data()); + if (nvrtcStatus == NVRTC_SUCCESS) { + // Return the populated std::vector, this should be RVO'd + return nvrtcSupportedArchs; + } + } + // If any of the above functions failed, we have no idea what arch's are supported, so assume none are? + return {}; +// Older CUDA's do not support this, but this is simple to hard-code for CUDA 11.0/11.1 (and our deprecated CUDA 10.x). 
+// CUDA 11.1 supports 35 to 86 +#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 1 + return {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86}; +// CUDA 11.0 supports 35 to 80 +#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 0 + return {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}; +// CUDA 10.x supports 30 to 75 +#elif (__CUDACC_VER_MAJOR__ >= 10) + return {30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75}; +// This should be all cases for FLAME GPU 2, but leave the fallback branch just in case +#else + return {}; +#endif +} + +int compute_capability::selectAppropraiteComputeCapability(const int target, const std::vector& architectures) { + int maxArch = 0; + for (const int &arch : architectures) { + if (arch <= target && arch > maxArch) { + maxArch = arch; + // The vector is in ascending order, so we can potentially early exit + if (arch == target) { + return target; + } + } + } + return maxArch; +} + } // namespace detail } // namespace util } // namespace flamegpu diff --git a/tests/test_cases/util/test_compute_capability.cu b/tests/test_cases/util/test_compute_capability.cu index ef1c463d3..528ff5b66 100644 --- a/tests/test_cases/util/test_compute_capability.cu +++ b/tests/test_cases/util/test_compute_capability.cu @@ -1,5 +1,6 @@ #include +#include #include "flamegpu/util/detail/compute_capability.cuh" #include "flamegpu/gpu/detail/CUDAErrorChecking.cuh" @@ -61,4 +62,61 @@ TEST(TestUtilComputeCapability, checkComputeCapability) { EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(-1)); EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(device_count)); } + +/** + * Test getting the nvrtc supported compute capabilities. + * This depends on the CUDA version used, and the dynamically linked nvrtc (when CUDA >= 11.2) so this is not ideal to test. 
+ */ +TEST(TestUtilComputeCapability, getNVRTCSupportedComputeCapabilties) { + std::vector architectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties(); + + // CUDA 11.2+ we do not know what values or how many this should return, so just assume a non zero number will be returned (in case of future additions / removals) + #if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ >= 2) + EXPECT_GT(architectures.size(), 0); + // CUDA 11.1 supports 35 to 86, (13 arch's) + #elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 1 + EXPECT_EQ(architectures.size(), 13); + // CUDA 11.0 supports 35 to 80 (12 arch's) + #elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 0 + EXPECT_EQ(architectures.size(), 12); + // CUDA 10.x supports 30 to 75 (13 arch's) + #elif (__CUDACC_VER_MAJOR__ >= 10) + EXPECT_EQ(architectures.size(), 13); + // Otherwise there will be 0. + #else + EXPECT_EQ(architectures.size(), 0); + #endif +} + +/** + * Test that given an ascending order of compute capabilities, and a target compute capability, the greatest value which is LE the target is found, or 0 otherwise. + */ +TEST(TestUtilComputeCapability, selectAppropraiteComputeCapability) { + // Check an exact match should be found + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {86}), 86); + // Check a miss but with a lower value returns the lower value + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {80}), 80); + // Check a miss without a valid value returns 0 + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {90}), 0); + // Check a miss occurs when no values are present in the vector. + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {}), 0); + + // CUDA 11.1-11.6, 35 to 86, 86 and 60 should be found, 30 should not. 
+ std::vector CUDA_11_1_ARCHES = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86}; + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_1_ARCHES), 86); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_1_ARCHES), 60); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_1_ARCHES), 0); + + // CUDA 11.0, 86 should not be found, but 80 should be used instead. 60 should be found, 30 should not. + std::vector CUDA_11_0_ARCHES = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}; + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_0_ARCHES), 80); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_0_ARCHES), 60); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_0_ARCHES), 0); + // CUDA 10.0, 86 should not be found, 75 should be used. 60 should be found, 30 should be found. + std::vector CUDA_10_0_ARCHES = {30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75}; + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_10_0_ARCHES), 75); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_10_0_ARCHES), 60); + EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_10_0_ARCHES), 30); +} + } // namespace flamegpu