From 71ffce829047f6a0b953aadf4cde7ab5af7d1793 Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Fri, 17 Jun 2022 16:47:46 +0100 Subject: [PATCH 1/3] Move curand behind a custom typedef. This allows the backing engine to be swapped with a preprocessor definition Have ran full test suite with Philox, and AgentRandom suite with MRG32 --- include/flamegpu/runtime/AgentFunction.cuh | 8 ++---- .../runtime/AgentFunctionCondition.cuh | 4 +-- include/flamegpu/runtime/DeviceAPI.cuh | 8 +++--- .../flamegpu/runtime/utility/AgentRandom.cuh | 8 +++--- .../runtime/utility/RandomManager.cuh | 10 +++---- include/flamegpu/util/detail/curand.cuh | 18 ++++++++++++ src/CMakeLists.txt | 1 + src/flamegpu/gpu/CUDASimulation.cu | 10 +++---- src/flamegpu/runtime/utility/RandomManager.cu | 28 +++++++++---------- src/flamegpu/util/detail/JitifyCache.cu | 19 +++++++++++++ 10 files changed, 75 insertions(+), 39 deletions(-) create mode 100644 include/flamegpu/util/detail/curand.cuh diff --git a/include/flamegpu/runtime/AgentFunction.cuh b/include/flamegpu/runtime/AgentFunction.cuh index c553d295c..ce19476a8 100644 --- a/include/flamegpu/runtime/AgentFunction.cuh +++ b/include/flamegpu/runtime/AgentFunction.cuh @@ -1,9 +1,7 @@ #ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_ #define INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_ -#include -#include - +#include "flamegpu/util/detail/curand.cuh" #include "flamegpu/runtime/detail/SharedBlock.h" #include "flamegpu/defines.h" #include "flamegpu/exception/FLAMEGPUDeviceException.cuh" @@ -29,7 +27,7 @@ typedef void(AgentFunctionWrapper)( const unsigned int popNo, const void *in_messagelist_metadata, const void *out_messagelist_metadata, - curandState *d_rng, + curandStateFLAMEGPU *d_rng, unsigned int *scanFlag_agentDeath, unsigned int *scanFlag_messageOutput, unsigned int *scanFlag_agentOutput); // Can't put __global__ in a typedef @@ -65,7 +63,7 @@ __global__ void agent_function_wrapper( const unsigned int popNo, const void *in_messagelist_metadata, const void *out_messagelist_metadata, - curandState *d_rng, + curandStateFLAMEGPU *d_rng, unsigned int *scanFlag_agentDeath, unsigned int *scanFlag_messageOutput, unsigned int *scanFlag_agentOutput) { diff --git a/include/flamegpu/runtime/AgentFunctionCondition.cuh b/include/flamegpu/runtime/AgentFunctionCondition.cuh index 7362a6dfe..72bea6a22 100644 --- a/include/flamegpu/runtime/AgentFunctionCondition.cuh +++ b/include/flamegpu/runtime/AgentFunctionCondition.cuh @@ -21,7 +21,7 @@ typedef void(AgentFunctionConditionWrapper)( const char* d_env_buffer, #endif const unsigned int popNo, - curandState *d_rng, + curandStateFLAMEGPU *d_rng, unsigned int *scanFlag_conditionResult); // Can't put __global__ in a typedef /** @@ -46,7 +46,7 @@ __global__ void agent_function_condition_wrapper( const char* d_env_buffer, #endif const unsigned int popNo, - curandState *d_rng, + curandStateFLAMEGPU *d_rng, unsigned int *scanFlag_conditionResult) { // We place these at the start of shared memory, so we can locate it anywhere in device code without a reference using detail::sm; diff --git a/include/flamegpu/runtime/DeviceAPI.cuh b/include/flamegpu/runtime/DeviceAPI.cuh index c3462947f..09964fdc9 100644 --- a/include/flamegpu/runtime/DeviceAPI.cuh +++ b/include/flamegpu/runtime/DeviceAPI.cuh @@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI { const detail::curve::CurveTable *, #endif const unsigned int, - curandState *, + curandStateFLAMEGPU *, unsigned int *); public: /** * @param d_rng Pointer to the device random state buffer to be used */ - __device__ ReadOnlyDeviceAPI(curandState *&d_rng) + __device__ ReadOnlyDeviceAPI(curandStateFLAMEGPU *&d_rng) : random(AgentRandom(&d_rng[getThreadIndex()])) , environment(DeviceEnvironment()) { } /** @@ -154,7 +154,7 @@ class DeviceAPI { const unsigned int, const void *, const void *, - curandState *, + curandStateFLAMEGPU *, unsigned int *, unsigned int *, unsigned int *); @@ -238,7 +238,7 @@ class DeviceAPI { */ __device__ DeviceAPI( id_t *&d_agent_output_nextID, - curandState *&d_rng, + curandStateFLAMEGPU *&d_rng, unsigned int *&scanFlag_agentOutput, typename MessageIn::In &&message_in, typename MessageOut::Out &&message_out) diff --git a/include/flamegpu/runtime/utility/AgentRandom.cuh b/include/flamegpu/runtime/utility/AgentRandom.cuh index 853bb3d5c..9a3f3e1cc 100644 --- a/include/flamegpu/runtime/utility/AgentRandom.cuh +++ b/include/flamegpu/runtime/utility/AgentRandom.cuh @@ -1,9 +1,9 @@ #ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_ #define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_ -#include #include +#include "flamegpu/util/detail/curand.cuh" #include "flamegpu/util/detail/StaticAssert.h" #include "flamegpu/exception/FLAMEGPUDeviceException.cuh" @@ -21,7 +21,7 @@ class AgentRandom { * @param d_rng ThreadSafe device curand state instance * this is a unique instance for the thread among all concurrently executing kernels */ - __forceinline__ __device__ AgentRandom(curandState *d_rng); + __forceinline__ __device__ AgentRandom(curandStateFLAMEGPU *d_rng); /** * Returns a float uniformly distributed between 0.0 and 1.0. * @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded. @@ -56,10 +56,10 @@ class AgentRandom { /** * Thread-safe index for accessing curand */ - curandState *d_random_state; + curandStateFLAMEGPU *d_random_state; }; -__forceinline__ __device__ AgentRandom::AgentRandom(curandState *d_rng) : d_random_state(d_rng) { } +__forceinline__ __device__ AgentRandom::AgentRandom(curandStateFLAMEGPU *d_rng) : d_random_state(d_rng) { } /** * All templates are specialised */ diff --git a/include/flamegpu/runtime/utility/RandomManager.cuh b/include/flamegpu/runtime/utility/RandomManager.cuh index 1a5583857..9f842645a 100644 --- a/include/flamegpu/runtime/utility/RandomManager.cuh +++ b/include/flamegpu/runtime/utility/RandomManager.cuh @@ -1,11 +1,11 @@ #ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_ #define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_ -#include #include #include #include +#include "flamegpu/util/detail/curand.cuh" #include "flamegpu/sim/Simulation.h" namespace flamegpu { @@ -62,7 +62,7 @@ class RandomManager { * while(length*shrinkModifier>_length) * length*=shrinkModifier */ - curandState *resize(size_type _length, cudaStream_t stream); + curandStateFLAMEGPU *resize(size_type _length, cudaStream_t stream); /** * Accessors */ @@ -84,14 +84,14 @@ class RandomManager { */ size_type size(); uint64_t seed(); - curandState *cudaRandomState(); + curandStateFLAMEGPU *cudaRandomState(); private: /** * Device array holding curand states * They should always be initialised */ - curandState *d_random_state = nullptr; + curandStateFLAMEGPU *d_random_state = nullptr; /** * Random seed used to initialise all currently allocated curand states */ @@ -127,7 +127,7 @@ class RandomManager { * @note h_max_random_state will be allocated to length h_max_random_size * However, it will only be initialised from hd_random_size(aka length) onwards */ - curandState *h_max_random_state = nullptr; + curandStateFLAMEGPU *h_max_random_state = nullptr; /** * Allocated length of h_max_random_state */ diff --git a/include/flamegpu/util/detail/curand.cuh b/include/flamegpu/util/detail/curand.cuh new file mode 100644 index 000000000..107f1f15f --- /dev/null +++ b/include/flamegpu/util/detail/curand.cuh @@ -0,0 +1,18 @@ +#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_ +#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_ + +/** + * This header exists to allow a convenient way to switch between curand implementations + */ + +#include + +#if defined(CURAND_MRG32k3a) +typedef curandStateMRG32k3a_t curandStateFLAMEGPU; +#elif defined(CURAND_Philox4_32_10) +typedef curandStatePhilox4_32_10_t curandStateFLAMEGPU; +#else // defined(CURAND_XORWOW) +typedef curandStateXORWOW_t curandStateFLAMEGPU; +#endif + +#endif // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54b4b4f15..fc107cfa1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -201,6 +201,7 @@ SET(SRC_INCLUDE ${FLAMEGPU_ROOT}/include/flamegpu/util/StringPair.h ${FLAMEGPU_ROOT}/include/flamegpu/util/type_decode.h ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/compute_capability.cuh + ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/curand.cuh ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/wddm.cuh ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/CUDAEventTimer.cuh ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cxxname.hpp diff --git a/src/flamegpu/gpu/CUDASimulation.cu b/src/flamegpu/gpu/CUDASimulation.cu index bfd4b2eb4..88a2e0e82 100644 --- a/src/flamegpu/gpu/CUDASimulation.cu +++ b/src/flamegpu/gpu/CUDASimulation.cu @@ -1,10 +1,10 @@ #include "flamegpu/gpu/CUDASimulation.h" -#include #include #include +#include "flamegpu/util/detail/curand.cuh" #include "flamegpu/model/AgentFunctionData.cuh" #include "flamegpu/model/LayerData.h" #include "flamegpu/model/AgentDescription.h" @@ -667,7 +667,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un // If any condition kernel needs to be executed, do so, by checking the number of threads from before. if (totalThreads > 0) { // Ensure RandomManager is the correct size to accommodate all threads to be launched - curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0)); + curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0)); // Track which stream to use for concurrency streamIdx = 0; // Sum the total number of threads being launched in the layer, for rng offsetting. @@ -696,7 +696,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un int gridSize = 0; // The actual grid size needed, based on input size // Agent function condition kernel wrapper args - curandState *t_rng = d_rng + totalThreads; + curandStateFLAMEGPU *t_rng = d_rng + totalThreads; unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag; #if !defined(SEATBELTS) || SEATBELTS auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx)); @@ -865,7 +865,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un // If any kernel needs to be executed, do so, by checking the number of threads from before. if (totalThreads > 0) { // Ensure RandomManager is the correct size to accommodate all threads to be launched - curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0)); + curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0)); // Total threads is now used to provide kernel launches an offset to thread-safe thread-index totalThreads = 0; streamIdx = 0; @@ -918,7 +918,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un int gridSize = 0; // The actual grid size needed, based on input size // Agent function kernel wrapper args - curandState * t_rng = d_rng + totalThreads; + curandStateFLAMEGPU *t_rng = d_rng + totalThreads; unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr; unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag; unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag; diff --git a/src/flamegpu/runtime/utility/RandomManager.cu b/src/flamegpu/runtime/utility/RandomManager.cu index ba4970da6..643a7c9b7 100644 --- a/src/flamegpu/runtime/utility/RandomManager.cu +++ b/src/flamegpu/runtime/utility/RandomManager.cu @@ -1,7 +1,6 @@ #include "flamegpu/runtime/utility/RandomManager.cuh" #include -#include #include #include @@ -10,6 +9,7 @@ #include #include +#include "flamegpu/util/detail/curand.cuh" #include "flamegpu/gpu/detail/CUDAErrorChecking.cuh" #include "flamegpu/gpu/CUDASimulation.h" @@ -81,7 +81,7 @@ void RandomManager::free() { freeDevice(); } -curandState *RandomManager::resize(size_type _length, cudaStream_t stream) { +curandStateFLAMEGPU *RandomManager::resize(size_type _length, cudaStream_t stream) { assert(growthModifier > 1.0); assert(shrinkModifier > 0.0); assert(shrinkModifier <= 1.0); @@ -104,7 +104,7 @@ curandState *RandomManager::resize(size_type _length, cudaStream_t stream) { resizeDeviceArray(t_length, stream); return d_random_state; } -__global__ void init_curand(curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) { +__global__ void init_curand(curandStateFLAMEGPU *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < threadCount) curand_init(seed, offset + id, 0, &d_random_state[offset + id]); @@ -114,12 +114,12 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str deviceInitialised = true; if (_length > h_max_random_size) { // Growing array - curandState *t_hd_random_state = nullptr; + curandStateFLAMEGPU *t_hd_random_state = nullptr; // Allocate new mem to t_hd - gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState))); + gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU))); // Copy hd->t_hd[**** ] if (d_random_state) { - gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream)); + gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream)); } // Update pointers hd=t_hd if (d_random_state) { @@ -131,7 +131,7 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str // We have part/all host backup, copy to device array // Reinit backup[ ** ] const size_type copy_len = std::min(h_max_random_size, _length); - gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandState), cudaMemcpyHostToDevice, stream)); // Host not pinned + gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandStateFLAMEGPU), cudaMemcpyHostToDevice, stream)); // Host not pinned length += copy_len; } if (_length > length) { @@ -143,20 +143,20 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str } } else { // Shrinking array - curandState *t_hd_random_state = nullptr; - curandState *t_h_max_random_state = nullptr; + curandStateFLAMEGPU *t_hd_random_state = nullptr; + curandStateFLAMEGPU *t_h_max_random_state = nullptr; // Allocate new - gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState))); + gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU))); // Allocate host backup if (length > h_max_random_size) - t_h_max_random_state = reinterpret_cast(malloc(length * sizeof(curandState))); + t_h_max_random_state = reinterpret_cast(malloc(length * sizeof(curandStateFLAMEGPU))); else t_h_max_random_state = h_max_random_state; // Copy old->new assert(d_random_state); - gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream)); + gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream)); // Copy part being shrunk away to host storage (This could be async with above memcpy?) - gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandState), cudaMemcpyDeviceToHost, stream)); + gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToHost, stream)); // Release and replace old host ptr if (length > h_max_random_size) { if (h_max_random_state) @@ -196,7 +196,7 @@ RandomManager::size_type RandomManager::size() { uint64_t RandomManager::seed() { return mSeed; } -curandState *RandomManager::cudaRandomState() { +curandStateFLAMEGPU *RandomManager::cudaRandomState() { return d_random_state; } diff --git a/src/flamegpu/util/detail/JitifyCache.cu b/src/flamegpu/util/detail/JitifyCache.cu index 57ad1230f..e9110a308 100644 --- a/src/flamegpu/util/detail/JitifyCache.cu +++ b/src/flamegpu/util/detail/JitifyCache.cu @@ -299,6 +299,15 @@ std::unique_ptr JitifyCache::compileKernel(const std::strin } #endif + // Forward the curand Engine request +#if defined(CURAND_MRG32k3a) + options.push_back(std::string("-DCURAND_MRG32k3a")); +#elif defined(CURAND_Philox4_32_10) + options.push_back(std::string("-DCURAND_Philox4_32_10")); +#elif defined(CURAND_XORWOW) + options.push_back(std::string("-DCURAND_XORWOW")); +#endif + // Set the cuda compuate capability architecture to optimize / generate for, based on the values supported by the current dynamiclaly linked nvrtc and the device in question. std::vector nvrtcArchitectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties(); if (nvrtcArchitectures.size()) { @@ -466,6 +475,16 @@ std::unique_ptr JitifyCache::loadKernel(const std::string & arch + "_" + seatbelts + "_" + std::string(flamegpu::VERSION_FULL) + "_" + +#ifdef USE_GLM + "glm_" + +#endif +#if defined(CURAND_MRG32k3a) + "MRG_" + +#elif defined(CURAND_Philox4_32_10) + "PHILOX_" + +#elif defined(CURAND_XORWOW) + "XORWOW_" + +#endif // Use jitify hash methods for consistent hashing between OSs std::to_string(hash_combine(hash_larson64(kernel_src.c_str()), hash_larson64(dynamic_header.c_str()))); // Does a copy with the right reference exist in memory? From 3c3f3ec8cfdcc5fa7921f2ff5bac3f5bc89518ea Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Fri, 17 Jun 2022 17:31:37 +0100 Subject: [PATCH 2/3] Add CMake option CURAND_ENGINE String with suitable values MRG, PHILOX and XORWOW (case-insensitive), other values will produce an error at configure time. --- CMakeLists.txt | 4 +++ README.md | 1 + cmake/common.cmake | 12 +++++++++ include/flamegpu/runtime/AgentFunction.cuh | 4 +-- .../runtime/AgentFunctionCondition.cuh | 4 +-- include/flamegpu/runtime/DeviceAPI.cuh | 8 +++--- .../flamegpu/runtime/utility/AgentRandom.cuh | 6 ++--- .../runtime/utility/RandomManager.cuh | 8 +++--- include/flamegpu/util/detail/curand.cuh | 14 +++++++--- src/flamegpu/gpu/CUDASimulation.cu | 8 +++--- src/flamegpu/runtime/utility/RandomManager.cu | 26 +++++++++---------- swig/python/CMakeLists.txt | 14 +++++++++- 12 files changed, 73 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b02e2301d..6feb2d682 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,10 @@ get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) unset(isMultiConfig) endif() +# Option to change curand engine used for CUDA random generation +set(CURAND_ENGINE "XORWOW" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"") +mark_as_advanced(CURAND_ENGINE) + # If CUDA >= 11.2, add an option to enable using NVCC_THREASD if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2) option(USE_NVCC_THREADS "Enable parallel compilation of multiple NVCC targets. See NVCC_THREADS for more control." ON) diff --git a/README.md b/README.md index d0defda8f..c6fc61059 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,7 @@ cmake --build . --target all | `EXPORT_RTC_SOURCES` | `ON`/`OFF` | At runtime, export dynamic RTC files to disk. Useful for debugging RTC models. Default `OFF` | | `RTC_DISK_CACHE` | `ON`/`OFF` | Enable/Disable caching of RTC functions to disk. Default `ON`. | | `VERBOSE_PTXAS` | `ON`/`OFF` | Enable verbose PTXAS output during compilation. Default `OFF`. | +| `CURAND_ENGINE` | `XORWOW`/`PHILOX`/`MRG` | Select the CUDA random engine. Default `XORWOW` | | `USE_GLM` | `ON`/`OFF` | Experimental feature for GLM type support in RTC models. Default `OFF`. |