Curand switch #873

Merged 3 commits on Oct 26, 2022
Changes from all commits
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -118,6 +118,10 @@ get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
unset(isMultiConfig)
endif()

# Option to change curand engine used for CUDA random generation
set(CURAND_ENGINE "XORWOW" CACHE STRING "The curand engine to use. Suitable options: \"XORWOW\", \"PHILOX\", \"MRG\"")
mark_as_advanced(CURAND_ENGINE)

# If CUDA >= 11.2, add an option to enable using NVCC_THREADS
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
option(USE_NVCC_THREADS "Enable parallel compilation of multiple NVCC targets. See NVCC_THREADS for more control." ON)
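With this option the engine becomes a configure-time choice, e.g. passing -DCURAND_ENGINE=MRG when configuring the build (any of XORWOW, PHILOX or MRG; the value is upper-cased in common.cmake before comparison, so case does not matter, and any other value aborts configuration).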
1 change: 1 addition & 0 deletions README.md
@@ -170,6 +170,7 @@ cmake --build . --target all
| `EXPORT_RTC_SOURCES` | `ON`/`OFF` | At runtime, export dynamic RTC files to disk. Useful for debugging RTC models. Default `OFF` |
| `RTC_DISK_CACHE` | `ON`/`OFF` | Enable/Disable caching of RTC functions to disk. Default `ON`. |
| `VERBOSE_PTXAS` | `ON`/`OFF` | Enable verbose PTXAS output during compilation. Default `OFF`. |
| `CURAND_ENGINE` | `XORWOW`/`PHILOX`/`MRG` | Select the CUDA random engine. Default `XORWOW` |
| `USE_GLM` | `ON`/`OFF` | Experimental feature for GLM type support in RTC models. Default `OFF`. |

<!-- Additional options which users can find if they need them.
12 changes: 12 additions & 0 deletions cmake/common.cmake
@@ -270,6 +270,18 @@ function(CommonCompilerSettings)
if(VERBOSE_PTXAS)
target_compile_options(${CCS_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas -v>")
endif()

# Request a specific curand engine
string(TOUPPER "${CURAND_ENGINE}" CURAND_ENGINE_UPPER)
if(${CURAND_ENGINE_UPPER} STREQUAL "MRG")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_MRG32k3a)
elseif(${CURAND_ENGINE_UPPER} STREQUAL "PHILOX")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_Philox4_32_10)
elseif(${CURAND_ENGINE_UPPER} STREQUAL "XORWOW")
target_compile_definitions(${CCS_TARGET} PRIVATE CURAND_XORWOW)
elseif(DEFINED CURAND_ENGINE)
message(FATAL_ERROR "${CURAND_ENGINE} is not a suitable value of CURAND_ENGINE\nOptions: \"MRG\", \"PHILOX\", \"XORWOW\"")
endif()

endfunction()

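Because the definition is added PRIVATE, it is only visible to translation units of targets configured through CommonCompilerSettings. Within such a target, a minimal, hypothetical check of which engine was baked in might look like the following (engine_name is not part of the library):

// Hypothetical helper: report which curand engine macro was selected at configure time.
#include <cstdio>

static const char *engine_name() {
#if defined(CURAND_MRG32k3a)
    return "MRG32k3a";
#elif defined(CURAND_XORWOW)
    return "XORWOW";
#else  // CURAND_Philox4_32_10, the fallback used by curand.cuh
    return "Philox4_32_10";
#endif
}

int main() {
    std::printf("curand engine: %s\n", engine_name());
    return 0;
}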
8 changes: 3 additions & 5 deletions include/flamegpu/runtime/AgentFunction.cuh
@@ -1,9 +1,7 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_AGENTFUNCTION_CUH_

#include <cuda_runtime.h>
#include <curand_kernel.h>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/runtime/detail/SharedBlock.h"
#include "flamegpu/defines.h"
#include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
@@ -29,7 +27,7 @@ typedef void(AgentFunctionWrapper)(
const unsigned int popNo,
const void *in_messagelist_metadata,
const void *out_messagelist_metadata,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_agentDeath,
unsigned int *scanFlag_messageOutput,
unsigned int *scanFlag_agentOutput); // Can't put __global__ in a typedef
@@ -65,7 +63,7 @@ __global__ void agent_function_wrapper(
const unsigned int popNo,
const void *in_messagelist_metadata,
const void *out_messagelist_metadata,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_agentDeath,
unsigned int *scanFlag_messageOutput,
unsigned int *scanFlag_agentOutput) {
4 changes: 2 additions & 2 deletions include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -21,7 +21,7 @@ typedef void(AgentFunctionConditionWrapper)(
const char* d_env_buffer,
#endif
const unsigned int popNo,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_conditionResult); // Can't put __global__ in a typedef

/**
@@ -46,7 +46,7 @@ __global__ void agent_function_condition_wrapper(
const char* d_env_buffer,
#endif
const unsigned int popNo,
curandState *d_rng,
util::detail::curandState *d_rng,
unsigned int *scanFlag_conditionResult) {
// We place these at the start of shared memory, so we can locate it anywhere in device code without a reference
using detail::sm;
8 changes: 4 additions & 4 deletions include/flamegpu/runtime/DeviceAPI.cuh
@@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI {
const detail::curve::CurveTable *,
#endif
const unsigned int,
curandState *,
util::detail::curandState *,
unsigned int *);

public:
/**
* @param d_rng Pointer to the device random state buffer to be used
*/
__device__ ReadOnlyDeviceAPI(curandState *&d_rng)
__device__ ReadOnlyDeviceAPI(util::detail::curandState *&d_rng)
: random(AgentRandom(&d_rng[getThreadIndex()]))
, environment(DeviceEnvironment()) { }
/**
@@ -154,7 +154,7 @@ class DeviceAPI {
const unsigned int,
const void *,
const void *,
curandState *,
util::detail::curandState *,
unsigned int *,
unsigned int *,
unsigned int *);
@@ -238,7 +238,7 @@ class DeviceAPI {
*/
__device__ DeviceAPI(
id_t *&d_agent_output_nextID,
curandState *&d_rng,
util::detail::curandState *&d_rng,
unsigned int *&scanFlag_agentOutput,
typename MessageIn::In &&message_in,
typename MessageOut::Out &&message_out)
8 changes: 4 additions & 4 deletions include/flamegpu/runtime/utility/AgentRandom.cuh
@@ -1,9 +1,9 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_

#include <curand_kernel.h>
#include <cassert>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/util/detail/StaticAssert.h"
#include "flamegpu/exception/FLAMEGPUDeviceException.cuh"

@@ -21,7 +21,7 @@ class AgentRandom {
* @param d_rng ThreadSafe device curand state instance
* this is a unique instance for the thread among all concurrently executing kernels
*/
__forceinline__ __device__ AgentRandom(curandState *d_rng);
__forceinline__ __device__ AgentRandom(util::detail::curandState *d_rng);
/**
* Returns a float uniformly distributed between 0.0 and 1.0.
* @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded.
@@ -56,10 +56,10 @@ class AgentRandom {
/**
* Thread-safe index for accessing curand
*/
curandState *d_random_state;
util::detail::curandState *d_random_state;
};

__forceinline__ __device__ AgentRandom::AgentRandom(curandState *d_rng) : d_random_state(d_rng) { }
__forceinline__ __device__ AgentRandom::AgentRandom(util::detail::curandState *d_rng) : d_random_state(d_rng) { }
/**
* All templates are specialised
*/
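Model code is unaffected by the engine switch: agent functions keep drawing numbers through AgentRandom via the device API. A sketch of the usual pattern (the function and the "x" variable are illustrative, not part of this PR):

// Sketch: per-thread random draws through AgentRandom; the underlying curand
// engine is whichever one the library was compiled with.
FLAMEGPU_AGENT_FUNCTION(randomWalk, flamegpu::MessageNone, flamegpu::MessageNone) {
    const float step = FLAMEGPU->random.uniform<float>();  // (0, 1], per the note above
    FLAMEGPU->setVariable<float>("x", FLAMEGPU->getVariable<float>("x") + step);
    return flamegpu::ALIVE;
}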
10 changes: 5 additions & 5 deletions include/flamegpu/runtime/utility/RandomManager.cuh
@@ -1,11 +1,11 @@
#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_

#include <curand_kernel.h>
#include <cstdint>
#include <random>
#include <string>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/sim/Simulation.h"

namespace flamegpu {
@@ -62,7 +62,7 @@ class RandomManager {
* while(length*shrinkModifier>_length)
* length*=shrinkModifier
*/
curandState *resize(size_type _length, cudaStream_t stream);
util::detail::curandState *resize(size_type _length, cudaStream_t stream);
/**
* Accessors
*/
@@ -84,14 +84,14 @@
*/
size_type size();
uint64_t seed();
curandState *cudaRandomState();
util::detail::curandState *cudaRandomState();

private:
/**
* Device array holding curand states
* They should always be initialised
*/
curandState *d_random_state = nullptr;
util::detail::curandState *d_random_state = nullptr;
/**
* Random seed used to initialise all currently allocated curand states
*/
@@ -127,7 +127,7 @@ class RandomManager {
* @note h_max_random_state will be allocated to length h_max_random_size
* However, it will only be initialised from hd_random_size(aka length) onwards
*/
curandState *h_max_random_state = nullptr;
util::detail::curandState *h_max_random_state = nullptr;
/**
* Allocated length of h_max_random_state
*/
26 changes: 26 additions & 0 deletions include/flamegpu/util/detail/curand.cuh
@@ -0,0 +1,26 @@
#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_

/**
* This header exists to allow a convenient way to switch between curand implementations
*/

#include <curand_kernel.h>

namespace flamegpu {
namespace util {
namespace detail {

#if defined(CURAND_MRG32k3a)
typedef curandStateMRG32k3a_t curandState;
#elif defined(CURAND_XORWOW)
typedef curandStateXORWOW_t curandState;
#else // defined(CURAND_Philox4_32_10)
typedef curandStatePhilox4_32_10_t curandState;
#endif

} // namespace detail
} // namespace util
} // namespace flamegpu

#endif // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
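The alias is a drop-in replacement because curand_init and the curand_uniform/curand_normal family are overloaded for all three state types, so device code written against it compiles unchanged whichever engine is selected. A standalone sketch of the same pattern, assuming only the CUDA toolkit (not part of this PR; build with nvcc):

// Sketch: engine-agnostic device code in the style of curand.cuh.
#include <curand_kernel.h>

#if defined(CURAND_MRG32k3a)
typedef curandStateMRG32k3a_t rng_state_t;
#elif defined(CURAND_XORWOW)
typedef curandStateXORWOW_t rng_state_t;
#else
typedef curandStatePhilox4_32_10_t rng_state_t;
#endif

__global__ void sample(rng_state_t *states, float *out, unsigned int n, unsigned long long seed) {
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n) return;
    curand_init(seed, id, 0, &states[id]);  // overloaded for every state type
    out[id] = curand_uniform(&states[id]);  // likewise
}

int main() {
    const unsigned int n = 256;
    rng_state_t *d_states = nullptr;
    float *d_out = nullptr;
    cudaMalloc(&d_states, n * sizeof(rng_state_t));
    cudaMalloc(&d_out, n * sizeof(float));
    sample<<<1, n>>>(d_states, d_out, n, 12ull);
    cudaDeviceSynchronize();
    cudaFree(d_out);
    cudaFree(d_states);
    return 0;
}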
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -201,6 +201,7 @@ SET(SRC_INCLUDE
${FLAMEGPU_ROOT}/include/flamegpu/util/StringPair.h
${FLAMEGPU_ROOT}/include/flamegpu/util/type_decode.h
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/compute_capability.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/curand.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/wddm.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/CUDAEventTimer.cuh
${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cxxname.hpp
10 changes: 5 additions & 5 deletions src/flamegpu/gpu/CUDASimulation.cu
@@ -1,10 +1,10 @@
#include "flamegpu/gpu/CUDASimulation.h"

#include <curand_kernel.h>

#include <algorithm>
#include <string>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/model/AgentFunctionData.cuh"
#include "flamegpu/model/LayerData.h"
#include "flamegpu/model/AgentDescription.h"
@@ -667,7 +667,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
// If any condition kernel needs to be executed, do so, by checking the number of threads from before.
if (totalThreads > 0) {
// Ensure RandomManager is the correct size to accommodate all threads to be launched
curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
// Track which stream to use for concurrency
streamIdx = 0;
// Sum the total number of threads being launched in the layer, for rng offsetting.
@@ -696,7 +696,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
int gridSize = 0; // The actual grid size needed, based on input size

// Agent function condition kernel wrapper args
curandState *t_rng = d_rng + totalThreads;
util::detail::curandState *t_rng = d_rng + totalThreads;
unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
#if !defined(SEATBELTS) || SEATBELTS
auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
@@ -865,7 +865,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
// If any kernel needs to be executed, do so, by checking the number of threads from before.
if (totalThreads > 0) {
// Ensure RandomManager is the correct size to accommodate all threads to be launched
curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
// Total threads is now used to provide kernel launches an offset to thread-safe thread-index
totalThreads = 0;
streamIdx = 0;
@@ -918,7 +918,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
int gridSize = 0; // The actual grid size needed, based on input size

// Agent function kernel wrapper args
curandState * t_rng = d_rng + totalThreads;
util::detail::curandState *t_rng = d_rng + totalThreads;
unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
28 changes: 14 additions & 14 deletions src/flamegpu/runtime/utility/RandomManager.cu
@@ -1,7 +1,6 @@
#include "flamegpu/runtime/utility/RandomManager.cuh"

#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <device_launch_parameters.h>

#include <ctime>
@@ -10,6 +9,7 @@
#include <cstdio>
#include <algorithm>

#include "flamegpu/util/detail/curand.cuh"
#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
#include "flamegpu/gpu/CUDASimulation.h"

@@ -81,7 +81,7 @@ void RandomManager::free() {
freeDevice();
}

curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
util::detail::curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
assert(growthModifier > 1.0);
assert(shrinkModifier > 0.0);
assert(shrinkModifier <= 1.0);
@@ -104,7 +104,7 @@ curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
resizeDeviceArray(t_length, stream);
return d_random_state;
}
__global__ void init_curand(curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
__global__ void init_curand(util::detail::curandState *d_random_state, unsigned int threadCount, uint64_t seed, RandomManager::size_type offset) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < threadCount)
curand_init(seed, offset + id, 0, &d_random_state[offset + id]);
@@ -114,12 +114,12 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
deviceInitialised = true;
if (_length > h_max_random_size) {
// Growing array
curandState *t_hd_random_state = nullptr;
util::detail::curandState *t_hd_random_state = nullptr;
// Allocate new mem to t_hd
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
// Copy hd->t_hd[**** ]
if (d_random_state) {
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
}
// Update pointers hd=t_hd
if (d_random_state) {
@@ -131,7 +131,7 @@ void RandomManager::resizeDeviceArray(const size_type &_length, cudaStream_t str
// We have part/all host backup, copy to device array
// Reinit backup[ ** ]
const size_type copy_len = std::min(h_max_random_size, _length);
gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(curandState), cudaMemcpyHostToDevice, stream)); // Host not pinned
gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(util::detail::curandState), cudaMemcpyHostToDevice, stream)); // Host not pinned
length += copy_len;
}
if (_length > length) {
@@ -143,20 +143,20 @@
}
} else {
// Shrinking array
curandState *t_hd_random_state = nullptr;
curandState *t_h_max_random_state = nullptr;
util::detail::curandState *t_hd_random_state = nullptr;
util::detail::curandState *t_h_max_random_state = nullptr;
// Allocate new
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(curandState)));
gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
// Allocate host backup
if (length > h_max_random_size)
t_h_max_random_state = reinterpret_cast<curandState *>(malloc(length * sizeof(curandState)));
t_h_max_random_state = reinterpret_cast<util::detail::curandState*>(malloc(length * sizeof(util::detail::curandState)));
else
t_h_max_random_state = h_max_random_state;
// Copy old->new
assert(d_random_state);
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(curandState), cudaMemcpyDeviceToDevice, stream));
gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
// Copy part being shrunk away to host storage (This could be async with above memcpy?)
gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(curandState), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(util::detail::curandState), cudaMemcpyDeviceToHost, stream));
// Release and replace old host ptr
if (length > h_max_random_size) {
if (h_max_random_state)
@@ -196,7 +196,7 @@ RandomManager::size_type RandomManager::size() {
uint64_t RandomManager::seed() {
return mSeed;
}
curandState *RandomManager::cudaRandomState() {
util::detail::curandState *RandomManager::cudaRandomState() {
return d_random_state;
}

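The sizeof changes above matter because the three state structs are not, in general, the same size, so allocations, copies and pointer arithmetic must all use the same alias the kernels were compiled against. A quick standalone check (sizes are implementation-defined, hence printed rather than assumed; build with nvcc):

// Sketch: compare the storage footprint of the three curand state types.
#include <cstdio>
#include <curand_kernel.h>

int main() {
    std::printf("XORWOW        : %zu bytes\n", sizeof(curandStateXORWOW_t));
    std::printf("Philox4_32_10 : %zu bytes\n", sizeof(curandStatePhilox4_32_10_t));
    std::printf("MRG32k3a      : %zu bytes\n", sizeof(curandStateMRG32k3a_t));
    return 0;
}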