Skip to content

Commit

Permalink
Set NVRTC gpu-architecture flag to maximum supported version for nvrt…
Browse files Browse the repository at this point in the history
…c & device

Closes #844

The maximum compute capability supported by the currently linked NVRTC that is less than or equal to the device's architecture is used for RTC compilation.

This fixes an issue where running an RTC model on consumer Ampere (SM_86) devices would fail on CUDA 11.0 and older, which are not aware of SM_86's existence.

CUDA 11.2+ includes methods to query which architectures are supported by the dynamically linked NVRTC (which may add or remove architectures in new releases, and due to a stable ABI from 11.2 for all 11.x releases the linked version can be different than the version available at compile time).
CUDA 11.1 and below (11.1, 11.0 and 10.x currently in our case) do not include these methods, and due to the absence of a stable nvrtc ABI for these versions the known values can be hardcoded at compile time (grim but simple).

A method to select the most appropriate value from an ascending-order vector has also been introduced, so this gencode functionality can be programmatically tested without having to predict what values would be appropriate based on the current device and the cuda version used, which is a moving target.
  • Loading branch information
ptheywood committed May 6, 2022
1 parent 14d4530 commit 3b0c3e9
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 6 deletions.
21 changes: 21 additions & 0 deletions include/flamegpu/util/detail/compute_capability.cuh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_
#define INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_

#include <vector>

#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"

namespace flamegpu {
Expand Down Expand Up @@ -30,6 +32,25 @@ int minimumCompiledComputeCapability();
*/
bool checkComputeCapability(int deviceIndex);

/**
* Get the compute capabilities supported by the linked NVRTC, irrespective of whether FLAMEGPU was configured for that architecture.
* CUDA 11.2 or greater provides methods to make this dynamic. Older versions of CUDA are hardcoded (11.1, 11.0 and 10.x only).
* @return vector of compute capability integers ((major * 10) + minor) in ascending order
*/
std::vector<int> getNVRTCSupportedComputeCapabilties();


/**
* Get the best matching compute capability from a vector of compute capabilities in ascending order
* I.e. get the maximum CC value which is less than or equal to the target CC
*
* This method has been separated from JitifyCache::compileKernel so that it can be tested generically, without having to write tests which are relative to the linked nvrtc and/or the current device.
*
* @param target compute capability to find the best match for
* @param architectures a vector of architectures in ascending order
* @return the best compute capability to use (the largest value LE target), or 0 if none are appropriate.
*/
int selectAppropraiteComputeCapability(const int target, const std::vector<int>& architectures);
} // namespace compute_capability
} // namespace detail
} // namespace util
Expand Down
22 changes: 16 additions & 6 deletions src/flamegpu/util/detail/JitifyCache.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "flamegpu/util/detail/JitifyCache.h"

#include <nvrtc.h>

#include <cassert>
#include <regex>
#include <array>
Expand Down Expand Up @@ -315,12 +317,20 @@ std::unique_ptr<KernelInstantiation> JitifyCache::compileKernel(const std::strin
}
#endif

// Set the compilation architecture target if it was successfully detected.
int currentDeviceIdx = 0;
cudaError_t status = cudaGetDevice(&currentDeviceIdx);
if (status == cudaSuccess) {
int arch = compute_capability::getComputeCapability(currentDeviceIdx);
options.push_back(std::string("--gpu-architecture=compute_" + std::to_string(arch)));
// Set the CUDA compute capability architecture to optimize / generate for, based on the values supported by the current dynamically linked nvrtc and the device in question.
std::vector<int> nvrtcArchitectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties();
if (nvrtcArchitectures.size()) {
int currentDeviceIdx = 0;
if (cudaSuccess == cudaGetDevice(&currentDeviceIdx)) {
int arch = compute_capability::getComputeCapability(currentDeviceIdx);
int maxSupportedArch = compute_capability::selectAppropraiteComputeCapability(arch, nvrtcArchitectures);
// only set a nvrtc compilation flag if a usable value was found
if (maxSupportedArch != 0) {
options.push_back(std::string("--gpu-architecture=compute_" + std::to_string(maxSupportedArch)));
}
// Optionally we could error here, but this *should* never occur
// else { }
}
}

// If CUDA is compiled with -G (--device-debug) forward it to the compiler, otherwise forward lineinfo for profiling.
Expand Down
55 changes: 55 additions & 0 deletions src/flamegpu/util/detail/compute_capability.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#include <nvrtc.h>

#include <cassert>

#include "flamegpu/util/detail/compute_capability.cuh"
#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"


namespace flamegpu {
namespace util {
namespace detail {
Expand Down Expand Up @@ -47,6 +52,56 @@ bool compute_capability::checkComputeCapability(int deviceIndex) {
}
}

std::vector<int> compute_capability::getNVRTCSupportedComputeCapabilties() {
    // NVRTC included with CUDA 11.2+ provides methods to query the architectures it supports.
    // From 11.2 the soname rules also changed, so libnvrtc.so.11.2 is valid for all nvrtc >= 11.2
    // (and libnvrtc.so.12 for CUDA 12.x etc). The dynamically linked version can therefore differ
    // from the compile-time version, so the runtime query must be used when available.
#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ >= 2)
    nvrtcResult nvrtcStatus = NVRTC_SUCCESS;
    int nvrtcNumSupportedArchs = 0;
    // Query the number of architecture flags supported by this nvrtc, to allocate enough memory
    nvrtcStatus = nvrtcGetNumSupportedArchs(&nvrtcNumSupportedArchs);
    if (nvrtcStatus == NVRTC_SUCCESS && nvrtcNumSupportedArchs > 0) {
        // Prepare a large enough std::vector for the results.
        // The count is known to be > 0 here; cast to size_t to avoid signed/unsigned comparison warnings.
        std::vector<int> nvrtcSupportedArchs(static_cast<size_t>(nvrtcNumSupportedArchs));
        assert(nvrtcSupportedArchs.size() >= static_cast<size_t>(nvrtcNumSupportedArchs));
        nvrtcStatus = nvrtcGetSupportedArchs(nvrtcSupportedArchs.data());
        if (nvrtcStatus == NVRTC_SUCCESS) {
            // Return the populated std::vector, this should be RVO'd
            return nvrtcSupportedArchs;
        }
    }
    // If either nvrtc call failed, there is no way to know which architectures are supported,
    // so conservatively report none rather than guessing.
    return {};
    // Older CUDAs do not support the query, but as there is no stable nvrtc ABI prior to 11.2 the
    // supported values for CUDA 11.0/11.1 (and our deprecated CUDA 10.x) can be hardcoded at compile time.
    // CUDA 11.1 supports 35 to 86
#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 1
    return {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86};
    // CUDA 11.0 supports 35 to 80
#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 0
    return {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80};
    // CUDA 10.x supports 30 to 75
#elif (__CUDACC_VER_MAJOR__ >= 10)
    return {30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75};
    // This should be all cases for FLAME GPU 2, but leave the fallback branch just in case
#else
    return {};
#endif
}

int compute_capability::selectAppropraiteComputeCapability(const int target, const std::vector<int>& architectures) {
    // Track the largest architecture accepted so far, 0 meaning "none usable".
    int best = 0;
    for (const int &arch : architectures) {
        // The vector is documented to be in ascending order, so once a value exceeds the
        // target no later value can be usable either - stop scanning immediately
        // (the original only early-exited on an exact match).
        if (arch > target) {
            break;
        }
        // arch <= target and, being in ascending order, >= any previously accepted value.
        best = arch;
    }
    return best;
}

} // namespace detail
} // namespace util
} // namespace flamegpu
58 changes: 58 additions & 0 deletions tests/test_cases/util/test_compute_capability.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <cuda_runtime.h>

#include <vector>
#include "flamegpu/util/detail/compute_capability.cuh"
#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"

Expand Down Expand Up @@ -61,4 +62,61 @@ TEST(TestUtilComputeCapability, checkComputeCapability) {
EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(-1));
EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(device_count));
}

/**
* Test getting the nvrtc supported compute capabilities.
* This depends on the CUDA version used, and the dynamically linked nvrtc (when CUDA >= 11.2) so this is not ideal to test.
*/
TEST(TestUtilComputeCapability, getNVRTCSupportedComputeCapabilties) {
// Query the architectures reported as supported by the linked nvrtc
// (or the hardcoded per-release values for CUDA < 11.2, which has no query API).
std::vector<int> architectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties();

// CUDA 11.2+: we do not know what values or how many this should return (the dynamically linked
// nvrtc may add/remove architectures), so just assert a non zero number is returned
#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ >= 2)
EXPECT_GT(architectures.size(), 0);
// CUDA 11.1 supports 35 to 86 (13 arch's)
#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 1
EXPECT_EQ(architectures.size(), 13);
// CUDA 11.0 supports 35 to 80 (12 arch's)
#elif (__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ == 0
EXPECT_EQ(architectures.size(), 12);
// CUDA 10.x supports 30 to 75 (13 arch's)
#elif (__CUDACC_VER_MAJOR__ >= 10)
EXPECT_EQ(architectures.size(), 13);
// Otherwise there will be 0 (fallback branch; not expected for any supported CUDA).
#else
EXPECT_EQ(architectures.size(), 0);
#endif
}

/**
* Test that given an ascending order of compute capabilities, and a target compute capability, greatest value which is LE the target is found, or 0 otherwise.
*/
TEST(TestUtilComputeCapability, selectAppropraiteComputeCapability) {
namespace cc = util::detail::compute_capability;
// An exact match must be selected.
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, {86}), 86);
// A miss with a lower candidate available must fall back to that candidate.
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, {80}), 80);
// When every candidate exceeds the target, 0 signals that nothing is usable.
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, {90}), 0);
// An empty candidate vector must also yield 0.
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, {}), 0);

// CUDA 11.1-11.6 support 35 through 86: 86 and 60 match exactly, 30 is too old.
const std::vector<int> cuda111Archs = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86};
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, cuda111Archs), 86);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(60, cuda111Archs), 60);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(30, cuda111Archs), 0);

// CUDA 11.0 supports 35 through 80: 86 falls back to 80, 60 matches, 30 is too old.
const std::vector<int> cuda110Archs = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80};
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, cuda110Archs), 80);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(60, cuda110Archs), 60);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(30, cuda110Archs), 0);

// CUDA 10.0 supports 30 through 75: 86 falls back to 75, 60 and 30 match exactly.
const std::vector<int> cuda100Archs = {30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75};
EXPECT_EQ(cc::selectAppropraiteComputeCapability(86, cuda100Archs), 75);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(60, cuda100Archs), 60);
EXPECT_EQ(cc::selectAppropraiteComputeCapability(30, cuda100Archs), 30);
}

} // namespace flamegpu

0 comments on commit 3b0c3e9

Please sign in to comment.