Curand switch #873

Merged 3 commits on Oct 26, 2022
Changes from all commits
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -118,6 +118,10 @@ get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
unset(isMultiConfig)
endif()

# Option to change curand engine used for CUDA random generation
set(CURAND_ENGINE "XORWOW" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"")
mark_as_advanced(CURAND_ENGINE)

# If CUDA >= 11.2, add an option to enable using NVCC_THREADS
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
option(USE_NVCC_THREADS "Enable parallel compilation of multiple NVCC targets. See NVCC_THREADS for more control." ON)
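With this option the engine becomes a configure-time choice, e.g. passing -DCURAND_ENGINE=MRG when configuring the build (any of XORWOW, PHILOX or MRG; the value is upper-cased in common.cmake before comparison, so case does not matter, and any other value aborts configuration).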
1 change: 1 addition & 0 deletions README.md
@@ -170,6 +170,7 @@ cmake --build . --target all
| `EXPORT_RTC_SOURCES` | `ON`/`OFF` | At runtime, export dynamic RTC files to disk. Useful for debugging RTC models. Default `OFF` |
| `RTC_DISK_CACHE` | `ON`/`OFF` | Enable/Disable caching of RTC functions to disk. Default `ON`. |
| `VERBOSE_PTXAS` | `ON`/`OFF` | Enable verbose PTXAS output during compilation. Default `OFF`. |
| `CURAND_ENGINE` | `XORWOW`/`PHILOX`/`MRG` | Select the CUDA random engine. Default `XORWOW` |
| `USE_GLM` | `ON`/`OFF` | Experimental feature for GLM type support in RTC models. Default `OFF`. |

<!-- Additional options which users can find if they need them.
12 changes: 12 additions & 0 deletions cmake/common.cmake
@@ -270,6 +270,18 @@ function(CommonCompilerSettings)
if(VERBOSE_PTXAS)
target_compile_options(${CCS_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas -v>")
endif()

# Request a specific curand engine
string(TOUPPER "${CURAND_ENGINE}" CURAND_ENGINE_UPPER)
if(${CURAND_ENGINE_UPPER} STREQUAL "MRG")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_MRG32k3a)
elseif(${CURAND_ENGINE_UPPER} STREQUAL "PHILOX")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_Philox4_32_10)
elseif(${CURAND_ENGINE_UPPER} STREQUAL "XORWOW")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_XORWOW)
elseif(DEFINED CURAND_ENGINE)
message(FATAL_ERROR "${CURAND_ENGINE} is not a suitable value of CURAND_ENGINE\nOptions: \"MRG\", \"PHILOX\", \"XORWOW\"")
endif()

endfunction()

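Because the definition is added PRIVATE, it is only visible to translation units of targets configured through CommonCompilerSettings. Within such a target, a minimal, hypothetical check of which engine was baked in might look like the following (engine_name is not part of the library):

// Hypothetical helper: report which curand engine macro was selected at configure time.
#include <cstdio>

static const char *engine_name() {
#if defined(CURAND_MRG32k3a)
    return "MRG32k3a";
#elif defined(CURAND_XORWOW)
    return "XORWOW";
#else  // CURAND_Philox4_32_10, the fallback used by curand.cuh
    return "Philox4_32_10";
#endif
}

int main() {
    std::printf("curand engine: %s\n", engine_name());
    return 0;
}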
8 changes: 3 additions & 5 deletions include/flamegpu/runtime/AgentFunction.cuh
@@ -1,9 +1,7 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_

#include <cuda_runtime.h>
#include <curand_kernel.h>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/runtime/detail/SharedBlock.h"
#include "flamegpu/defines.h"
#include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
@@ -29,7 +27,7 @@ typedef void(AgentFunctionWrapper)(
const unsigned int popNo,
const void *in_messagelist_metadata,
const void *out_messagelist_metadata,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_agentDeath,
unsigned int *scanFlag_messageOutput,
unsigned int *scanFlag_agentOutput); // Can't put __global__ in a typedef
@@ -65,7 +63,7 @@ __global__ void agent_function_wrapper(
const unsigned int popNo,
const void *in_messagelist_metadata,
const void *out_messagelist_metadata,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_agentDeath,
unsigned int *scanFlag_messageOutput,
unsigned int *scanFlag_agentOutput) {
4 changes: 2 additions & 2 deletions include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -21,7 +21,7 @@ typedef void(AgentFunctionConditionWrapper)(
const char* d_env_buffer,
#endif
const unsigned int popNo,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_conditionResult); // Can't put __global__ in a typedef

/**
@@ -46,7 +46,7 @@ __global__ void agent_function_condition_wrapper(
const char* d_env_buffer,
#endif
const unsigned int popNo,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_conditionResult) {
// We place these at the start of shared memory, so we can locate it anywhere in device code without a reference
using detail::sm;
8 changes: 4 additions & 4 deletions include/flamegpu/runtime/DeviceAPI.cuh
@@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI {
const detail::curve::CurveTable *,
#endif
const unsigned int,
curandState *,
util::detail::curandState *,
unsigned int *);

public:
/**
* @param d_rng Pointer to the device random state buffer to be used
*/
__device__ ReadOnlyDeviceAPI(curandState *&d_rng)
__device__ ReadOnlyDeviceAPI(util::detail::curandState *&d_rng)
: random(AgentRandom(&d_rng[getThreadIndex()]))
, environment(DeviceEnvironment()) { }
/**
@@ -154,7 +154,7 @@ class DeviceAPI {
const unsigned int,
const void *,
const void *,
curandState *,
util::detail::curandState *,
unsigned int *,
unsigned int *,
unsigned int *);
@@ -238,7 +238,7 @@ class DeviceAPI {
*/
__device__ DeviceAPI(
id_t *&d_agent_output_nextID,
curandState *&d_rng,
util::detail::curandState *&d_rng,
unsigned int *&scanFlag_agentOutput,
typename MessageIn::In &&message_in,
typename MessageOut::Out &&message_out)
8 changes: 4 additions & 4 deletions include/flamegpu/runtime/utility/AgentRandom.cuh
@@ -1,9 +1,9 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_

#include <curand_kernel.h>
#include <cassert>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/util/detail/StaticAssert.h"
#include "flamegpu/exception/FLAMEGPUDeviceException.cuh"

@@ -21,7 +21,7 @@ class AgentRandom {
* @param d_rng ThreadSafe device curand state instance
* this is a unique instance for the thread among all concurrently executing kernels
*/
__forceinline__ __device__ AgentRandom(curandState *d_rng);
__forceinline__ __device__ AgentRandom(util::detail::curandState *d_rng);
/**
* Returns a float uniformly distributed between 0.0 and 1.0.
* @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded.
@@ -56,10 +56,10 @@ class AgentRandom {
/**
* Thread-safe index for accessing curand
*/
curandState *d_random_state;
util::detail::curandState *d_random_state;
};

__forceinline__ __device__ AgentRandom::AgentRandom(curandState *d_rng) : d_random_state(d_rng) { }
__forceinline__ __device__ AgentRandom::AgentRandom(util::detail::curandState *d_rng) : d_random_state(d_rng) { }
/**
* All templates are specialised
*/
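Model code is unaffected by the engine switch: agent functions keep drawing numbers through AgentRandom via the device API. A sketch of the usual pattern (the function and the "x" variable are illustrative, not part of this PR):

// Sketch: per-thread random draws through AgentRandom; the underlying curand
// engine is whichever one the library was compiled with.
FLAMEGPU_AGENT_FUNCTION(randomWalk, flamegpu::MessageNone, flamegpu::MessageNone) {
    const float step = FLAMEGPU->random.uniform<float>();  // (0, 1], per the note above
    FLAMEGPU->setVariable<float>("x", FLAMEGPU->getVariable<float>("x") + step);
    return flamegpu::ALIVE;
}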
10 changes: 5 additions & 5 deletions include/flamegpu/runtime/utility/RandomManager.cuh
@@ -1,11 +1,11 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_

#include <curand_kernel.h>
#include <cstdint>
#include <random>
#include <string>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/sim/Simulation.h"

namespace flamegpu {
@@ -62,7 +62,7 @@ class RandomManager {
* while(length*shrinkModifier>_length)
* length*=shrinkModifier
*/
curandState *resize(size_type _length, cudaStream_t stream);
util::detail::curandState *resize(size_type _length, cudaStream_t stream);
/**
* Accessors
*/
@@ -84,14 +84,14 @@
*/
size_type size();
uint64_t seed();
curandState *cudaRandomState();
util::detail::curandState *cudaRandomState();

private:
/**
* Device array holding curand states
* They should always be initialised
*/
curandState *d_random_state = nullptr;
util::detail::curandState *d_random_state = nullptr;
/**
* Random seed used to initialise all currently allocated curand states
*/
@@ -127,7 +127,7 @@ class RandomManager {
* @note h_max_random_state will be allocated to length h_max_random_size
* However, it will only be initialised from hd_random_size(aka length) onwards
*/
curandState *h_max_random_state = nullptr;
util::detail::curandState *h_max_random_state = nullptr;
/**
* Allocated length of h_max_random_state
*/
26 changes: 26 additions & 0 deletions include/flamegpu/util/detail/curand.cuh
@@ -0,0 +1,26 @@
#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_

/**
* This header exists to allow a convenient way to switch between curand implementations
*/

#include <curand_kernel.h>

namespace flamegpu {
namespace util {
namespace detail {

#if defined(CURAND_MRG32k3a)
typedef curandStateMRG32k3a_t curandState;
#elif defined(CURAND_XORWOW)
typedef curandStateXORWOW_t curandState;
#else // defined(CURAND_Philox4_32_10)
typedef curandStatePhilox4_32_10_t curandState;
#endif

} // namespace detail
} // namespace util
} // namespace flamegpu

#endif // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
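The alias is a drop-in replacement because curand_init and the curand_uniform/curand_normal family are overloaded for all three state types, so device code written against it compiles unchanged whichever engine is selected. A standalone sketch of the same pattern, assuming only the CUDA toolkit (not part of this PR; build with nvcc):

// Sketch: engine-agnostic device code in the style of curand.cuh.
#include <curand_kernel.h>

#if defined(CURAND_MRG32k3a)
typedef curandStateMRG32k3a_t rng_state_t;
#elif defined(CURAND_XORWOW)
typedef curandStateXORWOW_t rng_state_t;
#else
typedef curandStatePhilox4_32_10_t rng_state_t;
#endif

__global__ void sample(rng_state_t *states, float *out, unsigned int n, unsigned long long seed) {
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n) return;
    curand_init(seed, id, 0, &states[id]);  // overloaded for every state type
    out[id] = curand_uniform(&states[id]);  // likewise
}

int main() {
    const unsigned int n = 256;
    rng_state_t *d_states = nullptr;
    float *d_out = nullptr;
    cudaMalloc(&d_states, n * sizeof(rng_state_t));
    cudaMalloc(&d_out, n * sizeof(float));
    sample<<<1, n>>>(d_states, d_out, n, 12ull);
    cudaDeviceSynchronize();
    cudaFree(d_out);
    cudaFree(d_states);
    return 0;
}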
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -201,6 +201,7 @@ SET(SRC_INCLUDE
${FLAMEGPU_ROOT}/include/flamegpu/util/StringPair.h
${FLAMEGPU_ROOT}/include/flamegpu/util/type_decode.h
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/compute_capability.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/curand.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/wddm.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/CUDAEventTimer.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cxxname.hpp
10 changes: 5 additions & 5 deletions src/flamegpu/gpu/CUDASimulation.cu
@@ -1,10 +1,10 @@
#include "flamegpu/gpu/CUDASimulation.h"

#include <curand_kernel.h>

#include <algorithm>
#include <string>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/model/AgentFunctionData.cuh"
#include "flamegpu/model/LayerData.h"
#include "flamegpu/model/AgentDescription.h"
@@ -667,7 +667,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
// If any condition kernel needs to be executed, do so, by checking the number of threads from before.
if (totalThreads > 0) {
// Ensure RandomManager is the correct size to accommodate all threads to be launched
curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
// Track which stream to use for concurrency
streamIdx = 0;
// Sum the total number of threads being launched in the layer, for rng offsetting.
@@ -696,7 +696,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
int gridSize = 0; // The actual grid size needed, based on input size

// Agent function condition kernel wrapper args
curandState *t_rng = d_rng + totalThreads;
util::detail::curandState *t_rng = d_rng + totalThreads;
unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
#if !defined(SEATBELTS) || SEATBELTS
auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
@@ -865,7 +865,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
// If any kernel needs to be executed, do so, by checking the number of threads from before.
if (totalThreads > 0) {
// Ensure RandomManager is the correct size to accommodate all threads to be launched
curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
// Total threads is now used to provide kernel launches an offset to thread-safe thread-index
totalThreads = 0;
streamIdx = 0;
@@ -918,7 +918,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
int gridSize = 0; // The actual grid size needed, based on input size

// Agent function kernel wrapper args
curandState * t_rng = d_rng + totalThreads;
util::detail::curandState *t_rng = d_rng + totalThreads;
unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
28 changes: 14 additions & 14 deletions src/flamegpu/runtime/utility/RandomManager.cu
@@ -1,7 +1,6 @@
#include "flamegpu/runtime/utility/RandomManager.cuh"

#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>

#include <ctime>
@@ -10,6 +9,7 @@
#include <cstdio>
#include <algorithm>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
#include "flamegpu/gpu/CUDASimulation.h"

@@ -81,7 +81,7 @@ void RandomManager::free() {
freeDevice();
}

curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
util::detail::curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
assert(growthModifier > 1.0);
assert(shrinkModifier > 0.0);
assert(shrinkModifier <= 1.0);
@@ -104,7 +104,7 @@ curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
resizeDeviceArray(t_length, stream);
return d_random_state;
}
__global__ void init_curand(curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
__global__ void init_curand(util::detail::curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < threadCount)
curand_init(seed, offset + id, 0, &d_random_state[offset + id]);
@@ -114,12 +114,12 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
deviceInitialised = true;
if (_length > h_max_random_size) {
// Growing array
curandState *t_hd_random_state = nullptr;
util::detail::curandState *t_hd_random_state = nullptr;
// Allocate new mem to t_hd
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
// Copy hd->t_hd[**** ]
if (d_random_state) {
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
}
// Update pointers hd=t_hd
if (d_random_state) {
@@ -131,7 +131,7 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
// We have part/all host backup, copy to device array
// Reinit backup[ ** ]
const size_type copy_len = std::min(h_max_random_size, _length);
gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandState), cudaMemcpyHostToDevice, stream)); // Host not pinned
gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(util::detail::curandState), cudaMemcpyHostToDevice, stream)); // Host not pinned
length += copy_len;
}
if (_length > length) {
@@ -143,20 +143,20 @@
}
} else {
// Shrinking array
curandState *t_hd_random_state = nullptr;
curandState *t_h_max_random_state = nullptr;
util::detail::curandState *t_hd_random_state = nullptr;
util::detail::curandState *t_h_max_random_state = nullptr;
// Allocate new
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
// Allocate host backup
if (length > h_max_random_size)
t_h_max_random_state = reinterpret_cast<curandState *>(malloc(length * sizeof(curandState)));
t_h_max_random_state = reinterpret_cast<util::detail::curandState*>(malloc(length * sizeof(util::detail::curandState)));
else
t_h_max_random_state = h_max_random_state;
// Copy old->new
assert(d_random_state);
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
// Copy part being shrunk away to host storage (This could be async with above memcpy?)
gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandState), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(util::detail::curandState), cudaMemcpyDeviceToHost, stream));
// Release and replace old host ptr
if (length > h_max_random_size) {
if (h_max_random_state)
@@ -196,7 +196,7 @@ RandomManager::size_type RandomManager::size() {
uint64_t RandomManager::seed() {
return mSeed;
}
curandState *RandomManager::cudaRandomState() {
util::detail::curandState *RandomManager::cudaRandomState() {
return d_random_state;
}

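The sizeof changes above matter because the three state structs are not, in general, the same size, so allocations, copies and pointer arithmetic must all use the same alias the kernels were compiled against. A quick standalone check (sizes are implementation-defined, hence printed rather than assumed; build with nvcc):

// Sketch: compare the storage footprint of the three curand state types.
#include <cstdio>
#include <curand_kernel.h>

int main() {
    std::printf("XORWOW        : %zu bytes\n", sizeof(curandStateXORWOW_t));
    std::printf("Philox4_32_10 : %zu bytes\n", sizeof(curandStatePhilox4_32_10_t));
    std::printf("MRG32k3a      : %zu bytes\n", sizeof(curandStateMRG32k3a_t));
    return 0;
}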