From 71ffce829047f6a0b953aadf4cde7ab5af7d1793 Mon Sep 17 00:00:00 2001
From: Robert Chisholm <robadob@robadob.org>
Date: Fri, 17 Jun 2022 16:47:46 +0100
Subject: [PATCH 1/3] Move curand behind a custom typedef.

This allows the backing engine to be swapped with a preprocessor definition

Have ran full test suite with Philox, and AgentRandom suite with MRG32
---
 include/flamegpu/runtime/AgentFunction.cuh    |  8 ++----
 .../runtime/AgentFunctionCondition.cuh        |  4 +--
 include/flamegpu/runtime/DeviceAPI.cuh        |  8 +++---
 .../flamegpu/runtime/utility/AgentRandom.cuh  |  8 +++---
 .../runtime/utility/RandomManager.cuh         | 10 +++----
 include/flamegpu/util/detail/curand.cuh       | 18 ++++++++++++
 src/CMakeLists.txt                            |  1 +
 src/flamegpu/gpu/CUDASimulation.cu            | 10 +++----
 src/flamegpu/runtime/utility/RandomManager.cu | 28 +++++++++----------
 src/flamegpu/util/detail/JitifyCache.cu       | 19 +++++++++++++
 10 files changed, 75 insertions(+), 39 deletions(-)
 create mode 100644 include/flamegpu/util/detail/curand.cuh

diff --git a/include/flamegpu/runtime/AgentFunction.cuh b/include/flamegpu/runtime/AgentFunction.cuh
index c553d295c..ce19476a8 100644
--- a/include/flamegpu/runtime/AgentFunction.cuh
+++ b/include/flamegpu/runtime/AgentFunction.cuh
@@ -1,9 +1,7 @@
 #ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_
 #define INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_
 
-#include <cuda_runtime.h>
-#include <curand_kernel.h>
-
+#include "flamegpu/util/detail/curand.cuh"
 #include "flamegpu/runtime/detail/SharedBlock.h"
 #include "flamegpu/defines.h"
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
@@ -29,7 +27,7 @@ typedef void(AgentFunctionWrapper)(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    curandState *d_rng,
+    curandStateFLAMEGPU *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput);  // Can't put __global__ in a typedef
@@ -65,7 +63,7 @@ __global__ void agent_function_wrapper(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    curandState *d_rng,
+    curandStateFLAMEGPU *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput) {
diff --git a/include/flamegpu/runtime/AgentFunctionCondition.cuh b/include/flamegpu/runtime/AgentFunctionCondition.cuh
index 7362a6dfe..72bea6a22 100644
--- a/include/flamegpu/runtime/AgentFunctionCondition.cuh
+++ b/include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -21,7 +21,7 @@ typedef void(AgentFunctionConditionWrapper)(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    curandState *d_rng,
+    curandStateFLAMEGPU *d_rng,
     unsigned int *scanFlag_conditionResult);  // Can't put __global__ in a typedef
 
 /**
@@ -46,7 +46,7 @@ __global__ void agent_function_condition_wrapper(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    curandState *d_rng,
+    curandStateFLAMEGPU *d_rng,
     unsigned int *scanFlag_conditionResult) {
     // We place these at the start of shared memory, so we can locate it anywhere in device code without a reference
     using detail::sm;
diff --git a/include/flamegpu/runtime/DeviceAPI.cuh b/include/flamegpu/runtime/DeviceAPI.cuh
index c3462947f..09964fdc9 100644
--- a/include/flamegpu/runtime/DeviceAPI.cuh
+++ b/include/flamegpu/runtime/DeviceAPI.cuh
@@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI {
         const detail::curve::CurveTable *,
 #endif
         const unsigned int,
-        curandState *,
+        curandStateFLAMEGPU *,
         unsigned int *);
 
  public:
     /**
      * @param d_rng Pointer to the device random state buffer to be used
      */
-    __device__ ReadOnlyDeviceAPI(curandState *&d_rng)
+    __device__ ReadOnlyDeviceAPI(curandStateFLAMEGPU *&d_rng)
         : random(AgentRandom(&d_rng[getThreadIndex()]))
         , environment(DeviceEnvironment()) { }
     /**
@@ -154,7 +154,7 @@ class DeviceAPI {
         const unsigned int,
         const void *,
         const void *,
-        curandState *,
+        curandStateFLAMEGPU *,
         unsigned int *,
         unsigned int *,
         unsigned int *);
@@ -238,7 +238,7 @@ class DeviceAPI {
      */
     __device__ DeviceAPI(
         id_t *&d_agent_output_nextID,
-        curandState *&d_rng,
+        curandStateFLAMEGPU *&d_rng,
         unsigned int *&scanFlag_agentOutput,
         typename MessageIn::In &&message_in,
         typename MessageOut::Out &&message_out)
diff --git a/include/flamegpu/runtime/utility/AgentRandom.cuh b/include/flamegpu/runtime/utility/AgentRandom.cuh
index 853bb3d5c..9a3f3e1cc 100644
--- a/include/flamegpu/runtime/utility/AgentRandom.cuh
+++ b/include/flamegpu/runtime/utility/AgentRandom.cuh
@@ -1,9 +1,9 @@
 #ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
 #define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
 
-#include <curand_kernel.h>
 #include <cassert>
 
+#include "flamegpu/util/detail/curand.cuh"
 #include "flamegpu/util/detail/StaticAssert.h"
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
 
@@ -21,7 +21,7 @@ class AgentRandom {
      * @param d_rng ThreadSafe device curand state instance
      *   this is a unique instance for the thread among all concurrently executing kernels
      */
-    __forceinline__ __device__ AgentRandom(curandState *d_rng);
+    __forceinline__ __device__ AgentRandom(curandStateFLAMEGPU *d_rng);
     /**
      * Returns a float uniformly distributed between 0.0 and 1.0. 
      * @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded.
@@ -56,10 +56,10 @@ class AgentRandom {
     /**
      * Thread-safe index for accessing curand
      */
-    curandState *d_random_state;
+     curandStateFLAMEGPU *d_random_state;
 };
 
-__forceinline__ __device__ AgentRandom::AgentRandom(curandState *d_rng) : d_random_state(d_rng) { }
+__forceinline__ __device__ AgentRandom::AgentRandom(curandStateFLAMEGPU *d_rng) : d_random_state(d_rng) { }
 /**
  * All templates are specialised
  */
diff --git a/include/flamegpu/runtime/utility/RandomManager.cuh b/include/flamegpu/runtime/utility/RandomManager.cuh
index 1a5583857..9f842645a 100644
--- a/include/flamegpu/runtime/utility/RandomManager.cuh
+++ b/include/flamegpu/runtime/utility/RandomManager.cuh
@@ -1,11 +1,11 @@
 #ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
 #define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
 
-#include <curand_kernel.h>
 #include <cstdint>
 #include <random>
 #include <string>
 
+#include "flamegpu/util/detail/curand.cuh"
 #include "flamegpu/sim/Simulation.h"
 
 namespace flamegpu {
@@ -62,7 +62,7 @@ class RandomManager {
      *     while(length*shrinkModifier>_length)
      *       length*=shrinkModifier
      */
-    curandState *resize(size_type _length, cudaStream_t stream);
+    curandStateFLAMEGPU *resize(size_type _length, cudaStream_t stream);
     /**
      * Accessors
      */
@@ -84,14 +84,14 @@ class RandomManager {
      */
     size_type size();
     uint64_t seed();
-    curandState *cudaRandomState();
+    curandStateFLAMEGPU *cudaRandomState();
 
  private:
     /**
      * Device array holding curand states
      * They should always be initialised
      */
-    curandState *d_random_state = nullptr;
+    curandStateFLAMEGPU *d_random_state = nullptr;
     /**
      * Random seed used to initialise all currently allocated curand states
      */
@@ -127,7 +127,7 @@ class RandomManager {
      * @note h_max_random_state will be allocated to length h_max_random_size
      * However, it will only be initialised from hd_random_size(aka length) onwards
      */
-    curandState *h_max_random_state = nullptr;
+    curandStateFLAMEGPU *h_max_random_state = nullptr;
     /**
      * Allocated length of h_max_random_state
      */
diff --git a/include/flamegpu/util/detail/curand.cuh b/include/flamegpu/util/detail/curand.cuh
new file mode 100644
index 000000000..107f1f15f
--- /dev/null
+++ b/include/flamegpu/util/detail/curand.cuh
@@ -0,0 +1,18 @@
+#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
+#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
+
+/**
+ * This header exists to allow a convenient way to switch between curand implementations
+ */
+
+#include <curand_kernel.h>
+
+#if defined(CURAND_MRG32k3a)
+typedef curandStateMRG32k3a_t curandStateFLAMEGPU;
+#elif defined(CURAND_Philox4_32_10)
+typedef curandStatePhilox4_32_10_t curandStateFLAMEGPU;
+#else  // defined(CURAND_XORWOW)
+typedef curandStateXORWOW_t curandStateFLAMEGPU;
+#endif
+
+#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54b4b4f15..fc107cfa1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -201,6 +201,7 @@ SET(SRC_INCLUDE
     ${FLAMEGPU_ROOT}/include/flamegpu/util/StringPair.h
     ${FLAMEGPU_ROOT}/include/flamegpu/util/type_decode.h
     ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/compute_capability.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/curand.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/wddm.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/CUDAEventTimer.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cxxname.hpp
diff --git a/src/flamegpu/gpu/CUDASimulation.cu b/src/flamegpu/gpu/CUDASimulation.cu
index bfd4b2eb4..88a2e0e82 100644
--- a/src/flamegpu/gpu/CUDASimulation.cu
+++ b/src/flamegpu/gpu/CUDASimulation.cu
@@ -1,10 +1,10 @@
 #include "flamegpu/gpu/CUDASimulation.h"
 
-#include <curand_kernel.h>
 
 #include <algorithm>
 #include <string>
 
+#include "flamegpu/util/detail/curand.cuh"
 #include "flamegpu/model/AgentFunctionData.cuh"
 #include "flamegpu/model/LayerData.h"
 #include "flamegpu/model/AgentDescription.h"
@@ -667,7 +667,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any condition kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Track which stream to use for concurrency
         streamIdx = 0;
         // Sum the total number of threads being launched in the layer, for rng offsetting.
@@ -696,7 +696,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
                 int gridSize = 0;  // The actual grid size needed, based on input size
 
                 //  Agent function condition kernel wrapper args
-                curandState *t_rng = d_rng + totalThreads;
+                curandStateFLAMEGPU *t_rng = d_rng + totalThreads;
                 unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
 #if !defined(SEATBELTS) || SEATBELTS
                 auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
@@ -865,7 +865,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Total threads is now used to provide kernel launches an offset to thread-safe thread-index
         totalThreads = 0;
         streamIdx = 0;
@@ -918,7 +918,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             int gridSize = 0;  // The actual grid size needed, based on input size
 
             // Agent function kernel wrapper args
-            curandState * t_rng = d_rng + totalThreads;
+            curandStateFLAMEGPU *t_rng = d_rng + totalThreads;
             unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
             unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
             unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
diff --git a/src/flamegpu/runtime/utility/RandomManager.cu b/src/flamegpu/runtime/utility/RandomManager.cu
index ba4970da6..643a7c9b7 100644
--- a/src/flamegpu/runtime/utility/RandomManager.cu
+++ b/src/flamegpu/runtime/utility/RandomManager.cu
@@ -1,7 +1,6 @@
 #include "flamegpu/runtime/utility/RandomManager.cuh"
 
 #include <cuda_runtime.h>
-#include <curand_kernel.h>
 #include <device_launch_parameters.h>
 
 #include<ctime>
@@ -10,6 +9,7 @@
 #include <cstdio>
 #include <algorithm>
 
+#include "flamegpu/util/detail/curand.cuh"
 #include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/gpu/CUDASimulation.h"
 
@@ -81,7 +81,7 @@ void RandomManager::free() {
     freeDevice();
 }
 
-curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
+curandStateFLAMEGPU *RandomManager::resize(size_type _length, cudaStream_t stream) {
     assert(growthModifier > 1.0);
     assert(shrinkModifier > 0.0);
     assert(shrinkModifier <= 1.0);
@@ -104,7 +104,7 @@ curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
         resizeDeviceArray(t_length, stream);
     return d_random_state;
 }
-__global__ void init_curand(curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
+__global__ void init_curand(curandStateFLAMEGPU *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
     int id = blockIdx.x * blockDim.x + threadIdx.x;
     if (id < threadCount)
         curand_init(seed, offset + id, 0, &d_random_state[offset + id]);
@@ -114,12 +114,12 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
     deviceInitialised = true;
     if (_length > h_max_random_size) {
         // Growing array
-        curandState *t_hd_random_state = nullptr;
+        curandStateFLAMEGPU *t_hd_random_state = nullptr;
         // Allocate new mem to t_hd
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU)));
         // Copy hd->t_hd[****    ]
         if (d_random_state) {
-            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
+            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream));
         }
         // Update pointers hd=t_hd
         if (d_random_state) {
@@ -131,7 +131,7 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
             // We have part/all host backup, copy to device array
             // Reinit backup[    **  ]
             const size_type copy_len = std::min(h_max_random_size, _length);
-            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandState), cudaMemcpyHostToDevice, stream));  // Host not pinned
+            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandStateFLAMEGPU), cudaMemcpyHostToDevice, stream));  // Host not pinned
             length += copy_len;
         }
         if (_length > length) {
@@ -143,20 +143,20 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
         }
     } else {
         // Shrinking array
-        curandState *t_hd_random_state = nullptr;
-        curandState *t_h_max_random_state = nullptr;
+        curandStateFLAMEGPU *t_hd_random_state = nullptr;
+        curandStateFLAMEGPU *t_h_max_random_state = nullptr;
         // Allocate new
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU)));
         // Allocate host backup
         if (length > h_max_random_size)
-            t_h_max_random_state = reinterpret_cast<curandState *>(malloc(length * sizeof(curandState)));
+            t_h_max_random_state = reinterpret_cast<curandStateFLAMEGPU*>(malloc(length * sizeof(curandStateFLAMEGPU)));
         else
             t_h_max_random_state = h_max_random_state;
         // Copy old->new
         assert(d_random_state);
-        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream));
         // Copy part being shrunk away to host storage (This could be async with above memcpy?)
-        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandState), cudaMemcpyDeviceToHost, stream));
+        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToHost, stream));
         // Release and replace old host ptr
         if (length > h_max_random_size) {
             if (h_max_random_state)
@@ -196,7 +196,7 @@ RandomManager::size_type RandomManager::size() {
 uint64_t RandomManager::seed() {
     return mSeed;
 }
-curandState *RandomManager::cudaRandomState() {
+curandStateFLAMEGPU *RandomManager::cudaRandomState() {
     return d_random_state;
 }
 
diff --git a/src/flamegpu/util/detail/JitifyCache.cu b/src/flamegpu/util/detail/JitifyCache.cu
index 57ad1230f..e9110a308 100644
--- a/src/flamegpu/util/detail/JitifyCache.cu
+++ b/src/flamegpu/util/detail/JitifyCache.cu
@@ -299,6 +299,15 @@ std::unique_ptr<KernelInstantiation> JitifyCache::compileKernel(const std::strin
     }
 #endif
 
+    // Forward the curand Engine request
+#if defined(CURAND_MRG32k3a)
+    options.push_back(std::string("-DCURAND_MRG32k3a"));
+#elif defined(CURAND_Philox4_32_10)
+    options.push_back(std::string("-DCURAND_Philox4_32_10"));
+#elif defined(CURAND_XORWOW)
+    options.push_back(std::string("-DCURAND_XORWOW"));
+#endif
+
     // Set the cuda compuate capability architecture to optimize / generate for, based on the values supported by the current dynamiclaly linked nvrtc and the device in question.
     std::vector<int> nvrtcArchitectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties();
     if (nvrtcArchitectures.size()) {
@@ -466,6 +475,16 @@ std::unique_ptr<KernelInstantiation> JitifyCache::loadKernel(const std::string &
         arch + "_" +
         seatbelts + "_" +
         std::string(flamegpu::VERSION_FULL) + "_" +
+#ifdef USE_GLM
+        "glm_" +
+#endif
+#if defined(CURAND_MRG32k3a)
+        "MRG_" +
+#elif defined(CURAND_Philox4_32_10)
+        "PHILOX_" +
+#elif defined(CURAND_XORWOW)
+        "XORWOW_" +
+#endif
         // Use jitify hash methods for consistent hashing between OSs
         std::to_string(hash_combine(hash_larson64(kernel_src.c_str()), hash_larson64(dynamic_header.c_str())));
     // Does a copy with the right reference exist in memory?

From 3c3f3ec8cfdcc5fa7921f2ff5bac3f5bc89518ea Mon Sep 17 00:00:00 2001
From: Robert Chisholm <robadob@robadob.org>
Date: Fri, 17 Jun 2022 17:31:37 +0100
Subject: [PATCH 2/3] Add CMake option CURAND_ENGINE

String with suitable values MRG, PHILOX and XORWOW (case-insensitive), other values will produce an error at configure time.
---
 CMakeLists.txt                                |  4 +++
 README.md                                     |  1 +
 cmake/common.cmake                            | 12 +++++++++
 include/flamegpu/runtime/AgentFunction.cuh    |  4 +--
 .../runtime/AgentFunctionCondition.cuh        |  4 +--
 include/flamegpu/runtime/DeviceAPI.cuh        |  8 +++---
 .../flamegpu/runtime/utility/AgentRandom.cuh  |  6 ++---
 .../runtime/utility/RandomManager.cuh         |  8 +++---
 include/flamegpu/util/detail/curand.cuh       | 14 +++++++---
 src/flamegpu/gpu/CUDASimulation.cu            |  8 +++---
 src/flamegpu/runtime/utility/RandomManager.cu | 26 +++++++++----------
 swig/python/CMakeLists.txt                    | 14 +++++++++-
 12 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b02e2301d..6feb2d682 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,6 +118,10 @@ get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
   unset(isMultiConfig)
 endif()
 
+# Option to change curand engine used for CUDA random generation
+set(CURAND_ENGINE "XORWOW" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"")
+mark_as_advanced(CURAND_ENGINE)
+
 # If CUDA >= 11.2, add an option to enable using NVCC_THREASD
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
     option(USE_NVCC_THREADS "Enable parallel compilation of multiple NVCC targets. See NVCC_THREADS for more control." ON)
diff --git a/README.md b/README.md
index d0defda8f..c6fc61059 100644
--- a/README.md
+++ b/README.md
@@ -170,6 +170,7 @@ cmake --build . --target all
 | `EXPORT_RTC_SOURCES`     | `ON`/`OFF`        | At runtime, export dynamic RTC files to disk. Useful for debugging RTC models. Default `OFF`               |
 | `RTC_DISK_CACHE`         | `ON`/`OFF`        | Enable/Disable caching of RTC functions to disk. Default `ON`.                                             |
 | `VERBOSE_PTXAS`          | `ON`/`OFF`        | Enable verbose PTXAS output during compilation. Default `OFF`.                                             |
+| `CURAND_ENGINE`          | `XORWOW`/`PHILOX`/`MRG` | Select the CUDA random engine. Default `XORWOW`                                                      |
 | `USE_GLM`                | `ON`/`OFF`        | Experimental feature for GLM type support in RTC models. Default `OFF`.                                    |
 
 <!-- Additional options which users can find if they need them.
diff --git a/cmake/common.cmake b/cmake/common.cmake
index a6d99cc51..bbd0da199 100644
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@@ -270,6 +270,18 @@ function(CommonCompilerSettings)
     if(VERBOSE_PTXAS)
         target_compile_options(${CCS_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas -v>")
     endif()
+    
+    # Request a specific curand engine
+    string(TOUPPER CURAND_ENGINE CURAND_ENGINE_UPPER)
+    if(${CURAND_ENGINE_UPPER} STREQUAL "MRG")
+        target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_MRG32k3a)
+    elseif(${CURAND_ENGINE_UPPER} STREQUAL "PHILOX")
+        target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_Philox4_32_10)
+    elseif(${CURAND_ENGINE_UPPER} STREQUAL "XORWOW")
+        target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_XORWOW)
+    elseif(DEFINED CURAND_ENGINE)
+        message(FATAL_ERROR "${CURAND_ENGINE} is not a suitable value of CURAND_ENGINE\nOptions: \"MRG\", \"PHILOX\", \"XORWOW\"")
+    endif()
 
 endfunction()
 
diff --git a/include/flamegpu/runtime/AgentFunction.cuh b/include/flamegpu/runtime/AgentFunction.cuh
index ce19476a8..4cf5e3c91 100644
--- a/include/flamegpu/runtime/AgentFunction.cuh
+++ b/include/flamegpu/runtime/AgentFunction.cuh
@@ -27,7 +27,7 @@ typedef void(AgentFunctionWrapper)(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    curandStateFLAMEGPU *d_rng,
+    util::detail::curandState *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput);  // Can't put __global__ in a typedef
@@ -63,7 +63,7 @@ __global__ void agent_function_wrapper(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    curandStateFLAMEGPU *d_rng,
+    util::detail::curandState *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput) {
diff --git a/include/flamegpu/runtime/AgentFunctionCondition.cuh b/include/flamegpu/runtime/AgentFunctionCondition.cuh
index 72bea6a22..f6c59f1a1 100644
--- a/include/flamegpu/runtime/AgentFunctionCondition.cuh
+++ b/include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -21,7 +21,7 @@ typedef void(AgentFunctionConditionWrapper)(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    curandStateFLAMEGPU *d_rng,
+    util::detail::curandState *d_rng,
     unsigned int *scanFlag_conditionResult);  // Can't put __global__ in a typedef
 
 /**
@@ -46,7 +46,7 @@ __global__ void agent_function_condition_wrapper(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    curandStateFLAMEGPU *d_rng,
+    util::detail::curandState *d_rng,
     unsigned int *scanFlag_conditionResult) {
     // We place these at the start of shared memory, so we can locate it anywhere in device code without a reference
     using detail::sm;
diff --git a/include/flamegpu/runtime/DeviceAPI.cuh b/include/flamegpu/runtime/DeviceAPI.cuh
index 09964fdc9..2a35ce1e4 100644
--- a/include/flamegpu/runtime/DeviceAPI.cuh
+++ b/include/flamegpu/runtime/DeviceAPI.cuh
@@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI {
         const detail::curve::CurveTable *,
 #endif
         const unsigned int,
-        curandStateFLAMEGPU *,
+        util::detail::curandState *,
         unsigned int *);
 
  public:
     /**
      * @param d_rng Pointer to the device random state buffer to be used
      */
-    __device__ ReadOnlyDeviceAPI(curandStateFLAMEGPU *&d_rng)
+    __device__ ReadOnlyDeviceAPI(util::detail::curandState *&d_rng)
         : random(AgentRandom(&d_rng[getThreadIndex()]))
         , environment(DeviceEnvironment()) { }
     /**
@@ -154,7 +154,7 @@ class DeviceAPI {
         const unsigned int,
         const void *,
         const void *,
-        curandStateFLAMEGPU *,
+        util::detail::curandState *,
         unsigned int *,
         unsigned int *,
         unsigned int *);
@@ -238,7 +238,7 @@ class DeviceAPI {
      */
     __device__ DeviceAPI(
         id_t *&d_agent_output_nextID,
-        curandStateFLAMEGPU *&d_rng,
+        util::detail::curandState *&d_rng,
         unsigned int *&scanFlag_agentOutput,
         typename MessageIn::In &&message_in,
         typename MessageOut::Out &&message_out)
diff --git a/include/flamegpu/runtime/utility/AgentRandom.cuh b/include/flamegpu/runtime/utility/AgentRandom.cuh
index 9a3f3e1cc..474ce5ff4 100644
--- a/include/flamegpu/runtime/utility/AgentRandom.cuh
+++ b/include/flamegpu/runtime/utility/AgentRandom.cuh
@@ -21,7 +21,7 @@ class AgentRandom {
      * @param d_rng ThreadSafe device curand state instance
      *   this is a unique instance for the thread among all concurrently executing kernels
      */
-    __forceinline__ __device__ AgentRandom(curandStateFLAMEGPU *d_rng);
+    __forceinline__ __device__ AgentRandom(util::detail::curandState *d_rng);
     /**
      * Returns a float uniformly distributed between 0.0 and 1.0. 
      * @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded.
@@ -56,10 +56,10 @@ class AgentRandom {
     /**
      * Thread-safe index for accessing curand
      */
-     curandStateFLAMEGPU *d_random_state;
+    util::detail::curandState *d_random_state;
 };
 
-__forceinline__ __device__ AgentRandom::AgentRandom(curandStateFLAMEGPU *d_rng) : d_random_state(d_rng) { }
+__forceinline__ __device__ AgentRandom::AgentRandom(util::detail::curandState *d_rng) : d_random_state(d_rng) { }
 /**
  * All templates are specialised
  */
diff --git a/include/flamegpu/runtime/utility/RandomManager.cuh b/include/flamegpu/runtime/utility/RandomManager.cuh
index 9f842645a..4d29bc71a 100644
--- a/include/flamegpu/runtime/utility/RandomManager.cuh
+++ b/include/flamegpu/runtime/utility/RandomManager.cuh
@@ -62,7 +62,7 @@ class RandomManager {
      *     while(length*shrinkModifier>_length)
      *       length*=shrinkModifier
      */
-    curandStateFLAMEGPU *resize(size_type _length, cudaStream_t stream);
+    util::detail::curandState*resize(size_type _length, cudaStream_t stream);
     /**
      * Accessors
      */
@@ -84,14 +84,14 @@ class RandomManager {
      */
     size_type size();
     uint64_t seed();
-    curandStateFLAMEGPU *cudaRandomState();
+    util::detail::curandState*cudaRandomState();
 
  private:
     /**
      * Device array holding curand states
      * They should always be initialised
      */
-    curandStateFLAMEGPU *d_random_state = nullptr;
+     util::detail::curandState*d_random_state = nullptr;
     /**
      * Random seed used to initialise all currently allocated curand states
      */
@@ -127,7 +127,7 @@ class RandomManager {
      * @note h_max_random_state will be allocated to length h_max_random_size
      * However, it will only be initialised from hd_random_size(aka length) onwards
      */
-    curandStateFLAMEGPU *h_max_random_state = nullptr;
+    util::detail::curandState *h_max_random_state = nullptr;
     /**
      * Allocated length of h_max_random_state
      */
diff --git a/include/flamegpu/util/detail/curand.cuh b/include/flamegpu/util/detail/curand.cuh
index 107f1f15f..e4ccc5a37 100644
--- a/include/flamegpu/util/detail/curand.cuh
+++ b/include/flamegpu/util/detail/curand.cuh
@@ -7,12 +7,20 @@
 
 #include <curand_kernel.h>
 
+namespace flamegpu {
+namespace util {
+namespace detail {
+
 #if defined(CURAND_MRG32k3a)
-typedef curandStateMRG32k3a_t curandStateFLAMEGPU;
+typedef curandStateMRG32k3a_t curandState;
 #elif defined(CURAND_Philox4_32_10)
-typedef curandStatePhilox4_32_10_t curandStateFLAMEGPU;
+typedef curandStatePhilox4_32_10_t curandState;
 #else  // defined(CURAND_XORWOW)
-typedef curandStateXORWOW_t curandStateFLAMEGPU;
+typedef curandStateXORWOW_t curandState;
 #endif
 
+}  // namespace detail
+}  // namespace util
+}  // namespace flamegpu
+
 #endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
diff --git a/src/flamegpu/gpu/CUDASimulation.cu b/src/flamegpu/gpu/CUDASimulation.cu
index 88a2e0e82..e687c7232 100644
--- a/src/flamegpu/gpu/CUDASimulation.cu
+++ b/src/flamegpu/gpu/CUDASimulation.cu
@@ -667,7 +667,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any condition kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Track which stream to use for concurrency
         streamIdx = 0;
         // Sum the total number of threads being launched in the layer, for rng offsetting.
@@ -696,7 +696,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
                 int gridSize = 0;  // The actual grid size needed, based on input size
 
                 //  Agent function condition kernel wrapper args
-                curandStateFLAMEGPU *t_rng = d_rng + totalThreads;
+                util::detail::curandState *t_rng = d_rng + totalThreads;
                 unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
 #if !defined(SEATBELTS) || SEATBELTS
                 auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
@@ -865,7 +865,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        curandStateFLAMEGPU *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Total threads is now used to provide kernel launches an offset to thread-safe thread-index
         totalThreads = 0;
         streamIdx = 0;
@@ -918,7 +918,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             int gridSize = 0;  // The actual grid size needed, based on input size
 
             // Agent function kernel wrapper args
-            curandStateFLAMEGPU *t_rng = d_rng + totalThreads;
+            util::detail::curandState *t_rng = d_rng + totalThreads;
             unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
             unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
             unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
diff --git a/src/flamegpu/runtime/utility/RandomManager.cu b/src/flamegpu/runtime/utility/RandomManager.cu
index 643a7c9b7..719862381 100644
--- a/src/flamegpu/runtime/utility/RandomManager.cu
+++ b/src/flamegpu/runtime/utility/RandomManager.cu
@@ -81,7 +81,7 @@ void RandomManager::free() {
     freeDevice();
 }
 
-curandStateFLAMEGPU *RandomManager::resize(size_type _length, cudaStream_t stream) {
+util::detail::curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
     assert(growthModifier > 1.0);
     assert(shrinkModifier > 0.0);
     assert(shrinkModifier <= 1.0);
@@ -104,7 +104,7 @@ curandStateFLAMEGPU *RandomManager::resize(size_type _length, cudaStream_t strea
         resizeDeviceArray(t_length, stream);
     return d_random_state;
 }
-__global__ void init_curand(curandStateFLAMEGPU *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
+__global__ void init_curand(util::detail::curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
     int id = blockIdx.x * blockDim.x + threadIdx.x;
     if (id < threadCount)
         curand_init(seed, offset + id, 0, &d_random_state[offset + id]);
@@ -114,12 +114,12 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
     deviceInitialised = true;
     if (_length > h_max_random_size) {
         // Growing array
-        curandStateFLAMEGPU *t_hd_random_state = nullptr;
+        util::detail::curandState *t_hd_random_state = nullptr;
         // Allocate new mem to t_hd
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
         // Copy hd->t_hd[****    ]
         if (d_random_state) {
-            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream));
+            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
         }
         // Update pointers hd=t_hd
         if (d_random_state) {
@@ -131,7 +131,7 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
             // We have part/all host backup, copy to device array
             // Reinit backup[    **  ]
             const size_type copy_len = std::min(h_max_random_size, _length);
-            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandStateFLAMEGPU), cudaMemcpyHostToDevice, stream));  // Host not pinned
+            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(util::detail::curandState), cudaMemcpyHostToDevice, stream));  // Host not pinned
             length += copy_len;
         }
         if (_length > length) {
@@ -143,20 +143,20 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
         }
     } else {
         // Shrinking array
-        curandStateFLAMEGPU *t_hd_random_state = nullptr;
-        curandStateFLAMEGPU *t_h_max_random_state = nullptr;
+        util::detail::curandState *t_hd_random_state = nullptr;
+        util::detail::curandState *t_h_max_random_state = nullptr;
         // Allocate new
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandStateFLAMEGPU)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
         // Allocate host backup
         if (length > h_max_random_size)
-            t_h_max_random_state = reinterpret_cast<curandStateFLAMEGPU*>(malloc(length * sizeof(curandStateFLAMEGPU)));
+            t_h_max_random_state = reinterpret_cast<util::detail::curandState*>(malloc(length * sizeof(util::detail::curandState)));
         else
             t_h_max_random_state = h_max_random_state;
         // Copy old->new
         assert(d_random_state);
-        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
         // Copy part being shrunk away to host storage (This could be async with above memcpy?)
-        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandStateFLAMEGPU), cudaMemcpyDeviceToHost, stream));
+        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(util::detail::curandState), cudaMemcpyDeviceToHost, stream));
         // Release and replace old host ptr
         if (length > h_max_random_size) {
             if (h_max_random_state)
@@ -196,7 +196,7 @@ RandomManager::size_type RandomManager::size() {
 uint64_t RandomManager::seed() {
     return mSeed;
 }
-curandStateFLAMEGPU *RandomManager::cudaRandomState() {
+util::detail::curandState *RandomManager::cudaRandomState() {
     return d_random_state;
 }
 
diff --git a/swig/python/CMakeLists.txt b/swig/python/CMakeLists.txt
index b40933980..3e3de466e 100644
--- a/swig/python/CMakeLists.txt
+++ b/swig/python/CMakeLists.txt
@@ -123,7 +123,7 @@ if (VISUALISATION)
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS VISUALISATION)
 endif()
 if (SEATBELTS)
-  # Debug alwasy has seatbelts.
+  # Debug always has seatbelts.
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS SEATBELTS=1)
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS SEATBELTS=1)
 else()
@@ -131,6 +131,18 @@ else()
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS $<IF:$<CONFIG:Debug>,SEATBELTS=1,SEATBELTS=0>)
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS $<IF:$<CONFIG:Debug>,SEATBELTS=1,SEATBELTS=0>)
 endif()
+# Build SWIG for the corresponding CURAND
+string(TOUPPER CURAND_ENGINE CURAND_ENGINE_UPPER)
+if(${CURAND_ENGINE_UPPER} STREQUAL "MRG")
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS CURAND_MRG32k3a)
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS CURAND_MRG32k3a)
+elseif(${CURAND_ENGINE_UPPER} STREQUAL "PHILOX")
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS CURAND_Philox4_32_10)
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS CURAND_Philox4_32_10)
+elseif(${CURAND_ENGINE_UPPER} STREQUAL "XORWOW")
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS CURAND_XORWOW)
+  set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS CURAND_XORWOW)
+endif()
 if (NOT RTC_DISK_CACHE)
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_COMPILE_DEFINITIONS DISABLE_RTC_DISK_CACHE)
   set_property(TARGET ${PYTHON_SWIG_TARGET_NAME} APPEND PROPERTY SWIG_GENERATED_COMPILE_DEFINITIONS DISABLE_RTC_DISK_CACHE)

From ee74445424f9322c01dee2675286d79dfc0d59c6 Mon Sep 17 00:00:00 2001
From: Robert Chisholm <robadob@robadob.org>
Date: Thu, 20 Oct 2022 10:58:33 +0100
Subject: [PATCH 3/3] Breaking Change: Default CUDA random engine switched to
 PHILOX (from XORWOW) Previous behaviour can be retained by setting
 CURAND_ENGINE to XORWOW at CMake configuration time.

---
 CMakeLists.txt                          | 2 +-
 include/flamegpu/util/detail/curand.cuh | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6feb2d682..f0013f3a1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,7 +119,7 @@ get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 endif()
 
 # Option to change curand engine used for CUDA random generation
-set(CURAND_ENGINE "XORWOW" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"")
+set(CURAND_ENGINE "PHILOX" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"")
 mark_as_advanced(CURAND_ENGINE)
 
 # If CUDA >= 11.2, add an option to enable using NVCC_THREASD
diff --git a/include/flamegpu/util/detail/curand.cuh b/include/flamegpu/util/detail/curand.cuh
index e4ccc5a37..79273bef3 100644
--- a/include/flamegpu/util/detail/curand.cuh
+++ b/include/flamegpu/util/detail/curand.cuh
@@ -13,10 +13,10 @@ namespace detail {
 
 #if defined(CURAND_MRG32k3a)
 typedef curandStateMRG32k3a_t curandState;
-#elif defined(CURAND_Philox4_32_10)
-typedef curandStatePhilox4_32_10_t curandState;
-#else  // defined(CURAND_XORWOW)
+#elif defined(CURAND_XORWOW)
 typedef curandStateXORWOW_t curandState;
+#else  // defined(CURAND_Philox4_32_10)
+typedef curandStatePhilox4_32_10_t curandState;
 #endif
 
 }  // namespace detail