From 1d65452ef630a78ad9e74249c279145c28ca131a Mon Sep 17 00:00:00 2001
From: Robert Chisholm <robadob@robadob.org>
Date: Thu, 8 Dec 2022 16:13:28 +0000
Subject: [PATCH 1/2] Breaking Change: Files within include/, src/ and tests/
 were reorganised into a more logical structure. This includes a number of
 classes being added to the detail namespace.

---
 README.md                                     |    2 +-
 include/flamegpu/{util => detail}/Any.h       |   10 +-
 .../{util => }/detail/CUDAEventTimer.cuh      |   12 +-
 .../flamegpu/{util => }/detail/JitifyCache.h  |    8 +-
 .../{util => }/detail/SignalHandlers.h        |    8 +-
 .../flamegpu/{util => }/detail/StaticAssert.h |    8 +-
 .../{util => }/detail/SteadyClockTimer.h      |   10 +-
 include/flamegpu/{util => }/detail/Timer.h    |    8 +-
 .../{util => }/detail/compute_capability.cuh  |   10 +-
 include/flamegpu/{util => }/detail/cuda.cuh   |   14 +-
 include/flamegpu/{util => }/detail/curand.cuh |    8 +-
 .../flamegpu/{util => }/detail/cxxname.hpp    |    8 +-
 .../flamegpu/{util => detail}/type_decode.h   |   12 +-
 include/flamegpu/{util => }/detail/wddm.cuh   |    8 +-
 .../exception/FLAMEGPUDeviceException.cuh     |    6 +-
 .../flamegpu/exception/FLAMEGPUException.h    |    2 +-
 include/flamegpu/flamegpu.h                   |   18 +-
 include/flamegpu/io/JSONLogger.h              |    4 +-
 include/flamegpu/io/JSONStateReader.h         |    2 +-
 include/flamegpu/io/JSONStateWriter.h         |    2 +-
 include/flamegpu/io/StateReader.h             |    6 +-
 include/flamegpu/io/StateReaderFactory.h      |    2 +-
 include/flamegpu/io/StateWriter.h             |    9 +-
 include/flamegpu/io/StateWriterFactory.h      |    2 +-
 include/flamegpu/io/XMLLogger.h               |    4 +-
 include/flamegpu/io/XMLStateReader.h          |    2 +-
 include/flamegpu/io/XMLStateWriter.h          |    2 +-
 include/flamegpu/model/AgentDescription.h     |   18 +-
 include/flamegpu/model/EnvironmentData.h      |    8 +-
 .../flamegpu/model/EnvironmentDescription.h   |  124 +-
 include/flamegpu/model/Variable.h             |    2 +-
 include/flamegpu/pop/DeviceAgentVector.h      |   19 -
 include/flamegpu/runtime/AgentFunction.cuh    |    6 +-
 .../runtime/AgentFunctionCondition.cuh        |    4 +-
 include/flamegpu/runtime/DeviceAPI.cuh        |   12 +-
 include/flamegpu/runtime/HostAPI.h            |   38 +-
 include/flamegpu/runtime/HostAgentAPI.cuh     |   16 +-
 .../{pop => runtime/agent}/AgentInstance.h    |   76 +-
 .../runtime/agent/DeviceAgentVector.h         |   19 +
 .../agent}/DeviceAgentVector_impl.h           |   26 +-
 .../flamegpu/runtime/agent/HostAgentAPI.cuh   | 1017 +++++++++++++++++
 .../runtime/{ => agent}/HostNewAgentAPI.h     |   66 +-
 .../runtime/detail/curve/DeviceCurve.cuh      |   16 +-
 .../runtime/detail/curve/HostCurve.cuh        |    4 +-
 .../DeviceEnvironment.cuh                     |   10 +-
 .../DeviceMacroProperty.cuh                   |    6 +-
 .../HostEnvironment.cuh                       |   20 +-
 .../HostMacroProperty.cuh                     |    6 +-
 .../messaging/MessageArray/MessageArrayHost.h |   10 +-
 .../MessageArray2D/MessageArray2DHost.h       |   10 +-
 .../MessageArray3D/MessageArray3DHost.h       |   10 +-
 .../MessageBruteForce/MessageBruteForceHost.h |   22 +-
 .../MessageBucket/MessageBucketHost.h         |   10 +-
 .../messaging/MessageNone/MessageNoneHost.h   |    7 +-
 .../MessageSpatial2D/MessageSpatial2DHost.h   |   10 +-
 .../MessageSpatial3D/MessageSpatial3DHost.h   |   12 +-
 .../messaging/MessageSpecialisationHandler.h  |    7 +-
 .../{utility => random}/AgentRandom.cuh       |   18 +-
 .../{utility => random}/HostRandom.cuh        |   22 +-
 .../{sim => simulation}/AgentLoggingConfig.h  |   38 +-
 .../AgentLoggingConfig_Reductions.cuh         |    6 +-
 .../AgentLoggingConfig_SumReturn.h            |    6 +-
 .../{pop => simulation}/AgentVector.h         |   16 +-
 .../{pop => simulation}/AgentVector_Agent.h   |   90 +-
 .../{gpu => simulation}/CUDAEnsemble.h        |    6 +-
 .../{gpu => simulation}/CUDASimulation.h      |   53 +-
 .../flamegpu/{sim => simulation}/LogFrame.h   |   34 +-
 .../{sim => simulation}/LoggingConfig.h       |   12 +-
 .../flamegpu/{sim => simulation}/RunPlan.h    |   82 +-
 .../{sim => simulation}/RunPlanVector.h       |   58 +-
 .../flamegpu/{sim => simulation}/Simulation.h |   21 +-
 .../detail}/AgentInterface.h                  |    8 +-
 .../{gpu => simulation/detail}/CUDAAgent.h    |   36 +-
 .../detail}/CUDAAgentStateList.h              |   23 +-
 .../detail/CUDAErrorChecking.cuh              |    6 +-
 .../{gpu => simulation/detail}/CUDAFatAgent.h |   20 +-
 .../detail}/CUDAFatAgentStateList.h           |   24 +-
 .../detail}/CUDAMacroEnvironment.h            |   24 +-
 .../{gpu => simulation/detail}/CUDAMessage.h  |   25 +-
 .../detail}/CUDAMessageList.h                 |   14 +-
 .../detail}/CUDAScanCompaction.h              |    9 +-
 .../detail}/CUDAScatter.cuh                   |   13 +-
 .../detail/CubTemporaryMemory.cuh             |    6 +-
 .../detail}/EnvironmentManager.cuh            |   44 +-
 .../detail/GenericMemoryVector.h              |    6 +-
 .../{pop => simulation}/detail/MemoryVector.h |    8 +-
 .../detail}/RandomManager.cuh                 |   22 +-
 .../{sim => simulation/detail}/SimLogger.h    |   14 +-
 .../{sim => simulation/detail}/SimRunner.h    |   14 +-
 include/flamegpu/visualiser/AgentVis.h        |    6 +-
 src/CMakeLists.txt                            |  176 +--
 src/flamegpu/{util => }/detail/JitifyCache.cu |   20 +-
 .../{util => }/detail/compute_capability.cu   |    6 +-
 src/flamegpu/{util => }/detail/wddm.cu        |    6 +-
 .../exception/FLAMEGPUDeviceException.cu      |   16 +-
 src/flamegpu/io/JSONLogger.cu                 |    6 +-
 src/flamegpu/io/JSONStateReader.cpp           |   12 +-
 src/flamegpu/io/JSONStateWriter.cpp           |    8 +-
 src/flamegpu/io/XMLLogger.cu                  |    6 +-
 src/flamegpu/io/XMLStateReader.cpp            |    8 +-
 src/flamegpu/io/XMLStateWriter.cpp            |    8 +-
 src/flamegpu/model/AgentFunctionData.cpp      |   14 +-
 .../model/AgentFunctionDescription.cpp        |   18 +-
 src/flamegpu/model/EnvironmentData.cpp        |    2 +-
 src/flamegpu/model/EnvironmentDescription.cpp |    2 +-
 src/flamegpu/runtime/HostAPI.cu               |   18 +-
 .../agent}/DeviceAgentVector_impl.cu          |   12 +-
 .../runtime/{ => agent}/HostAgentAPI.cu       |    6 +-
 .../runtime/detail/curve/HostCurve.cu         |    6 +-
 .../runtime/detail/curve/curve_rtc.cpp        |   84 +-
 .../HostEnvironment.cu                        |    4 +-
 .../runtime/messaging/MessageArray.cu         |   20 +-
 .../runtime/messaging/MessageArray2D.cu       |   20 +-
 .../runtime/messaging/MessageArray3D.cu       |   20 +-
 .../runtime/messaging/MessageBruteForce.cu    |   12 +-
 .../runtime/messaging/MessageBucket.cu        |   32 +-
 .../runtime/messaging/MessageSpatial2D.cu     |   36 +-
 .../runtime/messaging/MessageSpatial3D.cu     |   32 +-
 .../runtime/{utility => random}/HostRandom.cu |    2 +-
 .../{pop => simulation}/AgentInstance.cpp     |   10 +-
 .../{sim => simulation}/AgentLoggingConfig.cu |    2 +-
 .../{pop => simulation}/AgentVector.cpp       |    4 +-
 .../{pop => simulation}/AgentVector_Agent.cpp |    2 +-
 .../{gpu => simulation}/CUDAEnsemble.cu       |   38 +-
 .../{gpu => simulation}/CUDASimulation.cu     |  160 +--
 src/flamegpu/{sim => simulation}/LogFrame.cu  |   16 +-
 .../{sim => simulation}/LoggingConfig.cu      |    4 +-
 src/flamegpu/{sim => simulation}/RunPlan.cpp  |    6 +-
 .../{sim => simulation}/RunPlanVector.cpp     |    2 +-
 .../{sim => simulation}/Simulation.cu         |    6 +-
 .../{gpu => simulation/detail}/CUDAAgent.cu   |   48 +-
 .../detail}/CUDAAgentStateList.cu             |   22 +-
 .../detail}/CUDAFatAgent.cu                   |   22 +-
 .../detail}/CUDAFatAgentStateList.cu          |   26 +-
 .../detail}/CUDAMacroEnvironment.cu           |   10 +-
 .../{gpu => simulation/detail}/CUDAMessage.cu |   20 +-
 .../detail}/CUDAMessageList.cu                |   20 +-
 .../detail}/CUDAScanCompaction.cu             |   14 +-
 .../{gpu => simulation/detail}/CUDAScatter.cu |   14 +-
 .../detail/CubTemporaryMemory.cu              |   10 +-
 .../detail}/EnvironmentManager.cu             |   14 +-
 .../detail}/RandomManager.cu                  |   44 +-
 .../{sim => simulation/detail}/SimLogger.cu   |    6 +-
 .../{sim => simulation/detail}/SimRunner.cu   |    8 +-
 src/flamegpu/util/cleanup.cu                  |    4 +-
 src/flamegpu/visualiser/AgentVis.cpp          |    4 +-
 src/flamegpu/visualiser/ModelVis.cpp          |    2 +-
 swig/python/flamegpu.i                        |   42 +-
 tests/CMakeLists.txt                          |   92 +-
 tests/helpers/main.cu                         |    2 +-
 .../detail}/test_agent_state_transition.py    |    0
 .../{ => agent}/host_reduction/test_count.py  |    0
 .../{ => agent}/host_reduction/test_max.py    |    0
 .../host_reduction/test_mean_standarddev.py   |    0
 .../{ => agent}/host_reduction/test_min.py    |    0
 .../{ => agent}/host_reduction/test_sum.py    |    0
 .../{ => agent}/test_device_agent_creation.py |    0
 .../agent}/test_device_agent_vector.py        |    0
 .../{ => agent}/test_host_agent_creation.py   |    0
 .../{ => agent}/test_host_agent_sort.py       |    0
 .../test_agent_environment.py                 |    0
 .../test_device_macro_property.py             |    0
 .../test_environment_manager.py               |    0
 .../test_host_environment.py                  |    0
 .../test_host_macro_property.py               |    0
 .../runtime/{ => random}/test_agent_random.py |    0
 .../runtime/{ => random}/test_host_random.py  |    0
 .../detail}/test_cuda_subagent.py             |    0
 .../detail}/test_cuda_submacroenvironment.py  |    0
 .../{sim => simulation}/test_RunPlan.py       |    0
 .../{sim => simulation}/test_RunPlanVector.py |    0
 .../test_agent_instance.py                    |    0
 .../{pop => simulation}/test_agent_vector.py  |    0
 .../{gpu => simulation}/test_cuda_ensemble.py |    0
 .../test_cuda_simulation.py                   |    0
 .../test_gpu_validation.py                    |    0
 .../{util => detail}/test_CUDAEventTimer.cu   |   12 +-
 .../test_SteadyClockTimer.cpp                 |    8 +-
 .../test_compute_capability.cu                |   52 +-
 .../test_cases/{util => detail}/test_cuda.cu  |   24 +-
 tests/test_cases/detail/test_cxxname.cpp      |   24 +
 .../test_dependency_versions.cu               |    0
 .../test_multi_thread_device.cu               |   16 +-
 .../test_rtc_multi_thread_device.cu           |   14 +-
 .../test_cases/{util => detail}/test_wddm.cu  |   12 +-
 .../model/test_environment_description.cu     |    8 +-
 .../detail}/test_agent_state_transition.cu    |    0
 .../detail}/test_spatial_agent_sort.cu        |    0
 .../{ => agent}/host_reduction/test_count.cu  |    0
 .../host_reduction/test_histogram_even.cu     |    0
 .../{ => agent}/host_reduction/test_max.cu    |    0
 .../test_mean_standarddeviation.cu            |    0
 .../{ => agent}/host_reduction/test_min.cu    |    0
 .../{ => agent}/host_reduction/test_misc.cu   |    0
 .../{ => agent}/host_reduction/test_reduce.cu |    0
 .../{ => agent}/host_reduction/test_sum.cu    |    0
 .../host_reduction/test_transform_reduce.cu   |    0
 .../{ => agent}/test_device_agent_creation.cu |    0
 .../agent}/test_device_agent_vector.cu        |    0
 .../{ => agent}/test_host_agent_creation.cu   |    0
 .../{ => agent}/test_host_agent_sort.cu       |    0
 .../test_device_environment.cu                |    0
 .../test_device_macro_property.cu             |    0
 .../test_environment_manager.cu               |    0
 .../test_host_environment.cu                  |    2 +-
 .../test_host_macro_property.cu               |    0
 .../test_subenvironment_manager.cu            |    0
 .../runtime/{ => random}/test_agent_random.cu |    0
 .../runtime/{ => random}/test_host_random.cu  |    0
 .../detail}/test_cuda_subagent.cu             |    0
 .../detail}/test_cuda_submacroenvironment.cu  |    0
 .../{sim => simulation}/test_RunPlan.cu       |    0
 .../{sim => simulation}/test_RunPlanVector.cu |    0
 .../test_agent_instance.cu                    |    0
 .../{pop => simulation}/test_agent_vector.cu  |    0
 .../{gpu => simulation}/test_cuda_ensemble.cu |    0
 .../test_cuda_simulation.cu                   |    6 +-
 .../test_cuda_simulation_concurrency.cu       |    2 +-
 .../test_gpu_validation.cu                    |    0
 .../test_host_functions.cu                    |    0
 tests/test_cases/util/test_cleanup.cu         |    2 +-
 tests/test_cases/util/test_cxxname.cpp        |   24 -
 tests/test_cases/util/test_nvtx.cu            |    2 +-
 223 files changed, 2565 insertions(+), 1511 deletions(-)
 rename include/flamegpu/{util => detail}/Any.h (93%)
 rename include/flamegpu/{util => }/detail/CUDAEventTimer.cuh (93%)
 rename include/flamegpu/{util => }/detail/JitifyCache.h (96%)
 rename include/flamegpu/{util => }/detail/SignalHandlers.h (80%)
 rename include/flamegpu/{util => }/detail/StaticAssert.h (90%)
 rename include/flamegpu/{util => }/detail/SteadyClockTimer.h (90%)
 rename include/flamegpu/{util => }/detail/Timer.h (78%)
 rename include/flamegpu/{util => }/detail/compute_capability.cuh (91%)
 rename include/flamegpu/{util => }/detail/cuda.cuh (91%)
 rename include/flamegpu/{util => }/detail/curand.cuh (71%)
 rename include/flamegpu/{util => }/detail/cxxname.hpp (81%)
 rename include/flamegpu/{util => detail}/type_decode.h (78%)
 rename include/flamegpu/{util => }/detail/wddm.cuh (80%)
 delete mode 100644 include/flamegpu/pop/DeviceAgentVector.h
 rename include/flamegpu/{pop => runtime/agent}/AgentInstance.h (80%)
 create mode 100644 include/flamegpu/runtime/agent/DeviceAgentVector.h
 rename include/flamegpu/{pop => runtime/agent}/DeviceAgentVector_impl.h (94%)
 create mode 100644 include/flamegpu/runtime/agent/HostAgentAPI.cuh
 rename include/flamegpu/runtime/{ => agent}/HostNewAgentAPI.h (85%)
 rename include/flamegpu/runtime/{utility => environment}/DeviceEnvironment.cuh (95%)
 rename include/flamegpu/runtime/{utility => environment}/DeviceMacroProperty.cuh (99%)
 rename include/flamegpu/runtime/{utility => environment}/HostEnvironment.cuh (93%)
 rename include/flamegpu/runtime/{utility => environment}/HostMacroProperty.cuh (99%)
 rename include/flamegpu/runtime/{utility => random}/AgentRandom.cuh (88%)
 rename include/flamegpu/runtime/{utility => random}/HostRandom.cuh (80%)
 rename include/flamegpu/{sim => simulation}/AgentLoggingConfig.h (84%)
 rename include/flamegpu/{sim => simulation}/AgentLoggingConfig_Reductions.cuh (89%)
 rename include/flamegpu/{sim => simulation}/AgentLoggingConfig_SumReturn.h (88%)
 rename include/flamegpu/{pop => simulation}/AgentVector.h (98%)
 rename include/flamegpu/{pop => simulation}/AgentVector_Agent.h (84%)
 rename include/flamegpu/{gpu => simulation}/CUDAEnsemble.h (97%)
 rename include/flamegpu/{gpu => simulation}/CUDASimulation.h (94%)
 rename include/flamegpu/{sim => simulation}/LogFrame.h (94%)
 rename include/flamegpu/{sim => simulation}/LoggingConfig.h (94%)
 rename include/flamegpu/{sim => simulation}/RunPlan.h (87%)
 rename include/flamegpu/{sim => simulation}/RunPlanVector.h (93%)
 rename include/flamegpu/{sim => simulation}/Simulation.h (93%)
 rename include/flamegpu/{sim => simulation/detail}/AgentInterface.h (90%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAAgent.h (94%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAAgentStateList.h (91%)
 rename include/flamegpu/{gpu => simulation}/detail/CUDAErrorChecking.cuh (93%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAFatAgent.h (94%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAFatAgentStateList.h (93%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAMacroEnvironment.h (95%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAMessage.h (91%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAMessageList.h (88%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAScanCompaction.h (95%)
 rename include/flamegpu/{gpu => simulation/detail}/CUDAScatter.cuh (97%)
 rename include/flamegpu/{gpu => simulation}/detail/CubTemporaryMemory.cuh (77%)
 rename include/flamegpu/{runtime/utility => simulation/detail}/EnvironmentManager.cuh (92%)
 rename include/flamegpu/{pop => simulation}/detail/GenericMemoryVector.h (92%)
 rename include/flamegpu/{pop => simulation}/detail/MemoryVector.h (93%)
 rename include/flamegpu/{runtime/utility => simulation/detail}/RandomManager.cuh (90%)
 rename include/flamegpu/{sim => simulation/detail}/SimLogger.h (90%)
 rename include/flamegpu/{sim => simulation/detail}/SimRunner.h (94%)
 rename src/flamegpu/{util => }/detail/JitifyCache.cu (97%)
 rename src/flamegpu/{util => }/detail/compute_capability.cu (97%)
 rename src/flamegpu/{util => }/detail/wddm.cu (90%)
 rename src/flamegpu/{pop => runtime/agent}/DeviceAgentVector_impl.cu (97%)
 rename src/flamegpu/runtime/{ => agent}/HostAgentAPI.cu (90%)
 rename src/flamegpu/runtime/{utility => environment}/HostEnvironment.cu (55%)
 rename src/flamegpu/runtime/{utility => random}/HostRandom.cu (80%)
 rename src/flamegpu/{pop => simulation}/AgentInstance.cpp (83%)
 rename src/flamegpu/{sim => simulation}/AgentLoggingConfig.cu (97%)
 rename src/flamegpu/{pop => simulation}/AgentVector.cpp (99%)
 rename src/flamegpu/{pop => simulation}/AgentVector_Agent.cpp (97%)
 rename src/flamegpu/{gpu => simulation}/CUDAEnsemble.cu (94%)
 rename src/flamegpu/{gpu => simulation}/CUDASimulation.cu (92%)
 rename src/flamegpu/{sim => simulation}/LogFrame.cu (83%)
 rename src/flamegpu/{sim => simulation}/LoggingConfig.cu (96%)
 rename src/flamegpu/{sim => simulation}/RunPlan.cpp (95%)
 rename src/flamegpu/{sim => simulation}/RunPlanVector.cpp (99%)
 rename src/flamegpu/{sim => simulation}/Simulation.cu (99%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAAgent.cu (94%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAAgentStateList.cu (93%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAFatAgent.cu (94%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAFatAgentStateList.cu (91%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAMacroEnvironment.cu (96%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAMessage.cu (91%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAMessageList.cu (89%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAScanCompaction.cu (83%)
 rename src/flamegpu/{gpu => simulation/detail}/CUDAScatter.cu (98%)
 rename src/flamegpu/{gpu => simulation}/detail/CubTemporaryMemory.cu (71%)
 rename src/flamegpu/{runtime/utility => simulation/detail}/EnvironmentManager.cu (95%)
 rename src/flamegpu/{runtime/utility => simulation/detail}/RandomManager.cu (78%)
 rename src/flamegpu/{sim => simulation/detail}/SimLogger.cu (96%)
 rename src/flamegpu/{sim => simulation/detail}/SimRunner.cu (96%)
 rename tests/swig/python/runtime/{ => agent/detail}/test_agent_state_transition.py (100%)
 rename tests/swig/python/runtime/{ => agent}/host_reduction/test_count.py (100%)
 rename tests/swig/python/runtime/{ => agent}/host_reduction/test_max.py (100%)
 rename tests/swig/python/runtime/{ => agent}/host_reduction/test_mean_standarddev.py (100%)
 rename tests/swig/python/runtime/{ => agent}/host_reduction/test_min.py (100%)
 rename tests/swig/python/runtime/{ => agent}/host_reduction/test_sum.py (100%)
 rename tests/swig/python/runtime/{ => agent}/test_device_agent_creation.py (100%)
 rename tests/swig/python/{pop => runtime/agent}/test_device_agent_vector.py (100%)
 rename tests/swig/python/runtime/{ => agent}/test_host_agent_creation.py (100%)
 rename tests/swig/python/runtime/{ => agent}/test_host_agent_sort.py (100%)
 rename tests/swig/python/runtime/{ => environment}/test_agent_environment.py (100%)
 rename tests/swig/python/runtime/{ => environment}/test_device_macro_property.py (100%)
 rename tests/swig/python/runtime/{ => environment}/test_environment_manager.py (100%)
 rename tests/swig/python/runtime/{ => environment}/test_host_environment.py (100%)
 rename tests/swig/python/runtime/{ => environment}/test_host_macro_property.py (100%)
 rename tests/swig/python/runtime/{ => random}/test_agent_random.py (100%)
 rename tests/swig/python/runtime/{ => random}/test_host_random.py (100%)
 rename tests/swig/python/{gpu => simulation/detail}/test_cuda_subagent.py (100%)
 rename tests/swig/python/{gpu => simulation/detail}/test_cuda_submacroenvironment.py (100%)
 rename tests/swig/python/{sim => simulation}/test_RunPlan.py (100%)
 rename tests/swig/python/{sim => simulation}/test_RunPlanVector.py (100%)
 rename tests/swig/python/{pop => simulation}/test_agent_instance.py (100%)
 rename tests/swig/python/{pop => simulation}/test_agent_vector.py (100%)
 rename tests/swig/python/{gpu => simulation}/test_cuda_ensemble.py (100%)
 rename tests/swig/python/{gpu => simulation}/test_cuda_simulation.py (100%)
 rename tests/swig/python/{gpu => simulation}/test_gpu_validation.py (100%)
 rename tests/test_cases/{util => detail}/test_CUDAEventTimer.cu (88%)
 rename tests/test_cases/{util => detail}/test_SteadyClockTimer.cpp (85%)
 rename tests/test_cases/{util => detail}/test_compute_capability.cu (61%)
 rename tests/test_cases/{util => detail}/test_cuda.cu (86%)
 create mode 100644 tests/test_cases/detail/test_cxxname.cpp
 rename tests/test_cases/{util => detail}/test_dependency_versions.cu (100%)
 rename tests/test_cases/{util => detail}/test_multi_thread_device.cu (98%)
 rename tests/test_cases/{util => detail}/test_rtc_multi_thread_device.cu (97%)
 rename tests/test_cases/{util => detail}/test_wddm.cu (82%)
 rename tests/test_cases/runtime/{ => agent/detail}/test_agent_state_transition.cu (100%)
 rename tests/test_cases/runtime/{ => agent/detail}/test_spatial_agent_sort.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_count.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_histogram_even.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_max.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_mean_standarddeviation.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_min.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_misc.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_reduce.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_sum.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/host_reduction/test_transform_reduce.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/test_device_agent_creation.cu (100%)
 rename tests/test_cases/{pop => runtime/agent}/test_device_agent_vector.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/test_host_agent_creation.cu (100%)
 rename tests/test_cases/runtime/{ => agent}/test_host_agent_sort.cu (100%)
 rename tests/test_cases/runtime/{ => environment}/test_device_environment.cu (100%)
 rename tests/test_cases/runtime/{ => environment}/test_device_macro_property.cu (100%)
 rename tests/test_cases/runtime/{ => environment}/test_environment_manager.cu (100%)
 rename tests/test_cases/runtime/{ => environment}/test_host_environment.cu (99%)
 rename tests/test_cases/runtime/{ => environment}/test_host_macro_property.cu (100%)
 rename tests/test_cases/runtime/{ => environment}/test_subenvironment_manager.cu (100%)
 rename tests/test_cases/runtime/{ => random}/test_agent_random.cu (100%)
 rename tests/test_cases/runtime/{ => random}/test_host_random.cu (100%)
 rename tests/test_cases/{gpu => simulation/detail}/test_cuda_subagent.cu (100%)
 rename tests/test_cases/{gpu => simulation/detail}/test_cuda_submacroenvironment.cu (100%)
 rename tests/test_cases/{sim => simulation}/test_RunPlan.cu (100%)
 rename tests/test_cases/{sim => simulation}/test_RunPlanVector.cu (100%)
 rename tests/test_cases/{pop => simulation}/test_agent_instance.cu (100%)
 rename tests/test_cases/{pop => simulation}/test_agent_vector.cu (100%)
 rename tests/test_cases/{gpu => simulation}/test_cuda_ensemble.cu (100%)
 rename tests/test_cases/{gpu => simulation}/test_cuda_simulation.cu (99%)
 rename tests/test_cases/{gpu => simulation}/test_cuda_simulation_concurrency.cu (99%)
 rename tests/test_cases/{gpu => simulation}/test_gpu_validation.cu (100%)
 rename tests/test_cases/{sim => simulation}/test_host_functions.cu (100%)
 delete mode 100644 tests/test_cases/util/test_cxxname.cpp

diff --git a/README.md b/README.md
index 1770dc764..ebf36f949 100644
--- a/README.md
+++ b/README.md
@@ -362,4 +362,4 @@ For a full list of known issues pleases see the [Issue Tracker](https://github.c
 + Multiple known areas where performance can be improved (e.g. [#449](/~https://github.com/FLAMEGPU/FLAMEGPU2/issues/449), [#402](/~https://github.com/FLAMEGPU/FLAMEGPU2/issues/402))
 + Windows/MSVC builds using CUDA 11.0 may encounter errors when performing incremental builds if the static library has been recompiled. If this presents itself, re-save any `.cu` file in your executable producing project and re-trigger the build.
 + Debug builds under linux with CUDA 11.0 may encounter cuda errors during `validateIDCollisions`. Consider using an alternate CUDA version if this is required ([#569](/~https://github.com/FLAMEGPU/FLAMEGPU2/issues/569)).
-+ CUDA 11.0 with GCC 9 may encounter a segmentation fault during compilation of the test suite. Consider using GCC 8 with CUDA 11.0.
++ CUDA 11.0 with GCC 9 may encounter a segmentation fault during compilation of the test suite. Consider using GCC 8 with CUDA 11.0.
\ No newline at end of file
diff --git a/include/flamegpu/util/Any.h b/include/flamegpu/detail/Any.h
similarity index 93%
rename from include/flamegpu/util/Any.h
rename to include/flamegpu/detail/Any.h
index 90a9a4e96..00587de53 100644
--- a/include/flamegpu/util/Any.h
+++ b/include/flamegpu/detail/Any.h
@@ -1,10 +1,10 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_ANY_H_
-#define INCLUDE_FLAMEGPU_UTIL_ANY_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_ANY_H_
+#define INCLUDE_FLAMEGPU_DETAIL_ANY_H_
 
 #include <cstring>
 
 namespace flamegpu {
-namespace util {
+namespace detail {
 
 /**
  * Minimal std::any replacement, works pre c++17
@@ -88,7 +88,7 @@ struct Any {
     const unsigned int elements;
 };
 
-}  // namespace util
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_ANY_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_ANY_H_
diff --git a/include/flamegpu/util/detail/CUDAEventTimer.cuh b/include/flamegpu/detail/CUDAEventTimer.cuh
similarity index 93%
rename from include/flamegpu/util/detail/CUDAEventTimer.cuh
rename to include/flamegpu/detail/CUDAEventTimer.cuh
index fbd13c559..32b4425b4 100644
--- a/include/flamegpu/util/detail/CUDAEventTimer.cuh
+++ b/include/flamegpu/detail/CUDAEventTimer.cuh
@@ -1,14 +1,13 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDAEVENTTIMER_CUH_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDAEVENTTIMER_CUH_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_CUDAEVENTTIMER_CUH_
+#define INCLUDE_FLAMEGPU_DETAIL_CUDAEVENTTIMER_CUH_
 
 #include <cuda_runtime.h>
 
-#include "flamegpu/util/detail/Timer.h"
+#include "flamegpu/detail/Timer.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 /**
@@ -130,7 +129,6 @@ class CUDAEventTimer : public virtual Timer {
 };
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDAEVENTTIMER_CUH_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_CUDAEVENTTIMER_CUH_
diff --git a/include/flamegpu/util/detail/JitifyCache.h b/include/flamegpu/detail/JitifyCache.h
similarity index 96%
rename from include/flamegpu/util/detail/JitifyCache.h
rename to include/flamegpu/detail/JitifyCache.h
index 798529411..12bcb4ddc 100644
--- a/include/flamegpu/util/detail/JitifyCache.h
+++ b/include/flamegpu/detail/JitifyCache.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_JITIFYCACHE_H_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_JITIFYCACHE_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_JITIFYCACHE_H_
+#define INCLUDE_FLAMEGPU_DETAIL_JITIFYCACHE_H_
 
 #include <map>
 #include <mutex>
@@ -18,7 +18,6 @@
 using jitify::experimental::KernelInstantiation;
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 /**
@@ -145,7 +144,6 @@ class JitifyCache {
 };
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_JITIFYCACHE_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_JITIFYCACHE_H_
diff --git a/include/flamegpu/util/detail/SignalHandlers.h b/include/flamegpu/detail/SignalHandlers.h
similarity index 80%
rename from include/flamegpu/util/detail/SignalHandlers.h
rename to include/flamegpu/detail/SignalHandlers.h
index 2a6c8ec0b..82dc17b87 100644
--- a/include/flamegpu/util/detail/SignalHandlers.h
+++ b/include/flamegpu/detail/SignalHandlers.h
@@ -1,10 +1,9 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_SIGNALHANDLERS_H_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_SIGNALHANDLERS_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_SIGNALHANDLERS_H_
+#define INCLUDE_FLAMEGPU_DETAIL_SIGNALHANDLERS_H_
 #include <cstdlib>
 #include <csignal>
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 /**
  * Signal handlers used to try and produce a clean exit on interrupt
@@ -34,7 +33,6 @@ static void registerSignalHandlers(){
 };
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_SIGNALHANDLERS_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_SIGNALHANDLERS_H_
diff --git a/include/flamegpu/util/detail/StaticAssert.h b/include/flamegpu/detail/StaticAssert.h
similarity index 90%
rename from include/flamegpu/util/detail/StaticAssert.h
rename to include/flamegpu/detail/StaticAssert.h
index 806f4071d..1edd4bdf1 100644
--- a/include/flamegpu/util/detail/StaticAssert.h
+++ b/include/flamegpu/detail/StaticAssert.h
@@ -1,10 +1,9 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_STATICASSERT_H_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_STATICASSERT_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_STATICASSERT_H_
+#define INCLUDE_FLAMEGPU_DETAIL_STATICASSERT_H_
 
 #include <cstdint>
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 /**
  * These are taken from MSVCs std to allow us to perform static assertions
@@ -75,7 +74,6 @@ struct _Is_IntType
 };
 }  // namespace StaticAssert
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_STATICASSERT_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_STATICASSERT_H_
diff --git a/include/flamegpu/util/detail/SteadyClockTimer.h b/include/flamegpu/detail/SteadyClockTimer.h
similarity index 90%
rename from include/flamegpu/util/detail/SteadyClockTimer.h
rename to include/flamegpu/detail/SteadyClockTimer.h
index fa779fe2b..9a2e0168c 100644
--- a/include/flamegpu/util/detail/SteadyClockTimer.h
+++ b/include/flamegpu/detail/SteadyClockTimer.h
@@ -1,14 +1,13 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_STEADYCLOCKTIMER_H_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_STEADYCLOCKTIMER_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_STEADYCLOCKTIMER_H_
+#define INCLUDE_FLAMEGPU_DETAIL_STEADYCLOCKTIMER_H_
 
 #include <chrono>
 
-#include "flamegpu/util/detail/Timer.h"
+#include "flamegpu/detail/Timer.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
 
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 /** 
@@ -91,7 +90,6 @@ class SteadyClockTimer : public virtual Timer {
 };
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_STEADYCLOCKTIMER_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_STEADYCLOCKTIMER_H_
diff --git a/include/flamegpu/util/detail/Timer.h b/include/flamegpu/detail/Timer.h
similarity index 78%
rename from include/flamegpu/util/detail/Timer.h
rename to include/flamegpu/detail/Timer.h
index 6f6f6d36d..c21c95234 100644
--- a/include/flamegpu/util/detail/Timer.h
+++ b/include/flamegpu/detail/Timer.h
@@ -1,9 +1,8 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_TIMER_H_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_TIMER_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_TIMER_H_
+#define INCLUDE_FLAMEGPU_DETAIL_TIMER_H_
 
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 /** 
@@ -38,7 +37,6 @@ class Timer {
 };
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_TIMER_H_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_TIMER_H_
diff --git a/include/flamegpu/util/detail/compute_capability.cuh b/include/flamegpu/detail/compute_capability.cuh
similarity index 91%
rename from include/flamegpu/util/detail/compute_capability.cuh
rename to include/flamegpu/detail/compute_capability.cuh
index cdfd6c2dd..c64341c7e 100644
--- a/include/flamegpu/util/detail/compute_capability.cuh
+++ b/include/flamegpu/detail/compute_capability.cuh
@@ -1,14 +1,13 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_COMPUTE_CAPABILITY_CUH_
+#define INCLUDE_FLAMEGPU_DETAIL_COMPUTE_CAPABILITY_CUH_
 
 #include <vector>
 #include <string>
 #include <set>
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 namespace compute_capability {
 
@@ -69,7 +68,6 @@ const std::string getDeviceName(int deviceIndex);
 const std::string getDeviceNames(std::set<int> devices);
 }  // namespace compute_capability
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_COMPUTE_CAPABILITY_CUH_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_COMPUTE_CAPABILITY_CUH_
diff --git a/include/flamegpu/util/detail/cuda.cuh b/include/flamegpu/detail/cuda.cuh
similarity index 91%
rename from include/flamegpu/util/detail/cuda.cuh
rename to include/flamegpu/detail/cuda.cuh
index 330199459..e3e14d4ca 100644
--- a/include/flamegpu/util/detail/cuda.cuh
+++ b/include/flamegpu/detail/cuda.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDA_CUH_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDA_CUH_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_CUDA_CUH_
+#define INCLUDE_FLAMEGPU_DETAIL_CUDA_CUH_
 
 /**
  * Collection of cuda related utility methods for internal use.
@@ -11,7 +11,6 @@
 #include "flamegpu/exception/FLAMEGPUException.h"
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 namespace cuda {
 
@@ -68,7 +67,7 @@ inline cudaError_t cudaFreeHost(void* devPtr) {
 inline bool cuDevicePrimaryContextIsActive(int ordinal) {
     // Throw an exception if a negative device ordinal is passed
     if (ordinal < 0) {
-        THROW exception::InvalidCUDAdevice("CUDA Device ordinals must be non-negative integers, in util::detail::cuda::cuDevicePrimaryContextIsActive()");
+        THROW exception::InvalidCUDAdevice("CUDA Device ordinals must be non-negative integers, in detail::cuda::cuDevicePrimaryContextIsActive()");
     }
 
     int deviceCount = 0;
@@ -78,11 +77,11 @@ inline bool cuDevicePrimaryContextIsActive(int ordinal) {
     if (cuErr == CUDA_SUCCESS) {
         // If the device count is 0, throw.
         if (deviceCount == 0) {
-            THROW exception::InvalidCUDAdevice("Error no CUDA devices found!, in util::detail::cuda::cuDevicePrimaryContextIsActive()");
+            THROW exception::InvalidCUDAdevice("Error no CUDA devices found!, in detail::cuda::cuDevicePrimaryContextIsActive()");
         }
         // If the ordinal is invalid, throw
         if (ordinal >= deviceCount) {
-            THROW exception::InvalidCUDAdevice("Requested CUDA device %d is not valid, only %d CUDA devices available!, in util::detail::cuda::cuDevicePrimaryContextIsActive()", ordinal, deviceCount);
+            THROW exception::InvalidCUDAdevice("Requested CUDA device %d is not valid, only %d CUDA devices available!, in detail::cuda::cuDevicePrimaryContextIsActive()", ordinal, deviceCount);
         }
         // Get the CUdevice handle, silently dismissing any cuErrors as they are falsey
         CUdevice deviceHandle;
@@ -103,7 +102,6 @@ inline bool cuDevicePrimaryContextIsActive(int ordinal) {
 
 }  // namespace cuda
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CUDA_CUH_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_CUDA_CUH_
diff --git a/include/flamegpu/util/detail/curand.cuh b/include/flamegpu/detail/curand.cuh
similarity index 71%
rename from include/flamegpu/util/detail/curand.cuh
rename to include/flamegpu/detail/curand.cuh
index c07ad2ed9..e7c05d068 100644
--- a/include/flamegpu/util/detail/curand.cuh
+++ b/include/flamegpu/detail/curand.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_CURAND_CUH_
+#define INCLUDE_FLAMEGPU_DETAIL_CURAND_CUH_
 
 /**
  * This header exists to allow a convenient way to switch between curand implementations
@@ -8,7 +8,6 @@
 #include <curand_kernel.h>
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 #if defined(FLAMEGPU_CURAND_MRG32k3a)
@@ -20,7 +19,6 @@ typedef curandStatePhilox4_32_10_t curandState;
 #endif
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CURAND_CUH_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_CURAND_CUH_
diff --git a/include/flamegpu/util/detail/cxxname.hpp b/include/flamegpu/detail/cxxname.hpp
similarity index 81%
rename from include/flamegpu/util/detail/cxxname.hpp
rename to include/flamegpu/detail/cxxname.hpp
index b22ff1953..7008df240 100644
--- a/include/flamegpu/util/detail/cxxname.hpp
+++ b/include/flamegpu/detail/cxxname.hpp
@@ -1,10 +1,9 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_CXXNAME_HPP_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_CXXNAME_HPP_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_CXXNAME_HPP_
+#define INCLUDE_FLAMEGPU_DETAIL_CXXNAME_HPP_
 
 #include <string>
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 namespace cxxname {
 
@@ -26,7 +25,6 @@ inline std::string getUnqualifiedName(std::string qualified) {
 
 }  // namespace cxxname
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_CXXNAME_HPP_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_CXXNAME_HPP_
diff --git a/include/flamegpu/util/type_decode.h b/include/flamegpu/detail/type_decode.h
similarity index 78%
rename from include/flamegpu/util/type_decode.h
rename to include/flamegpu/detail/type_decode.h
index b4d8bf29a..99b7f894d 100644
--- a/include/flamegpu/util/type_decode.h
+++ b/include/flamegpu/detail/type_decode.h
@@ -1,5 +1,8 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_TYPE_DECODE_H_
-#define INCLUDE_FLAMEGPU_UTIL_TYPE_DECODE_H_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_TYPE_DECODE_H_
+#define INCLUDE_FLAMEGPU_DETAIL_TYPE_DECODE_H_
+
+namespace flamegpu {
+namespace detail {
 
 /**
  * This struct allows us to natively decode GLM types to their type + length
@@ -33,4 +36,7 @@ struct type_decode<glm::vec<N, T, Q>> {
 };
 #endif
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_TYPE_DECODE_H_
+}  // namespace detail
+}  // namespace flamegpu
+
+#endif  // INCLUDE_FLAMEGPU_DETAIL_TYPE_DECODE_H_
diff --git a/include/flamegpu/util/detail/wddm.cuh b/include/flamegpu/detail/wddm.cuh
similarity index 80%
rename from include/flamegpu/util/detail/wddm.cuh
rename to include/flamegpu/detail/wddm.cuh
index fa3b584d8..181c514dd 100644
--- a/include/flamegpu/util/detail/wddm.cuh
+++ b/include/flamegpu/detail/wddm.cuh
@@ -1,8 +1,7 @@
-#ifndef INCLUDE_FLAMEGPU_UTIL_DETAIL_WDDM_CUH_
-#define INCLUDE_FLAMEGPU_UTIL_DETAIL_WDDM_CUH_
+#ifndef INCLUDE_FLAMEGPU_DETAIL_WDDM_CUH_
+#define INCLUDE_FLAMEGPU_DETAIL_WDDM_CUH_
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 namespace wddm {
 
@@ -28,7 +27,6 @@ bool deviceIsWDDM();
 
 }  // namespace wddm
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_UTIL_DETAIL_WDDM_CUH_
+#endif  // INCLUDE_FLAMEGPU_DETAIL_WDDM_CUH_
diff --git a/include/flamegpu/exception/FLAMEGPUDeviceException.cuh b/include/flamegpu/exception/FLAMEGPUDeviceException.cuh
index 9182caa10..3c722e7d9 100644
--- a/include/flamegpu/exception/FLAMEGPUDeviceException.cuh
+++ b/include/flamegpu/exception/FLAMEGPUDeviceException.cuh
@@ -4,7 +4,7 @@
 #include <string>
 #include <type_traits>
 
-#include "flamegpu/gpu/CUDAScanCompaction.h"
+#include "flamegpu/simulation/detail/CUDAScanCompaction.h"
 
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
 
@@ -40,12 +40,12 @@ class DeviceExceptionManager {
      * 1 per stream
      * nullptr until used
      */
-    DeviceExceptionBuffer *d_buffer[CUDAScanCompaction::MAX_STREAMS];
+    DeviceExceptionBuffer *d_buffer[detail::CUDAScanCompaction::MAX_STREAMS];
     /**
      * Host buffers to copy error buffers back to
      * 1 per stream
      */
-    DeviceExceptionBuffer hd_buffer[CUDAScanCompaction::MAX_STREAMS];
+    DeviceExceptionBuffer hd_buffer[detail::CUDAScanCompaction::MAX_STREAMS];
 };
 }  // namespace exception
 }  // namespace flamegpu
diff --git a/include/flamegpu/exception/FLAMEGPUException.h b/include/flamegpu/exception/FLAMEGPUException.h
index 0e14f8265..606a1c428 100644
--- a/include/flamegpu/exception/FLAMEGPUException.h
+++ b/include/flamegpu/exception/FLAMEGPUException.h
@@ -403,7 +403,7 @@ DERIVED_FLAMEGPUException(InvalidFilePath, "File does not exist.");
  */
 DERIVED_FLAMEGPUException(FileAlreadyExists, "File already existst.");
 /**
- * Defines an exception indicating that the flamegpu::util::detail::Timer has been used incorrectly.
+ * Defines an exception indicating that the flamegpu::detail::Timer has been used incorrectly.
  */
 DERIVED_FLAMEGPUException(TimerException, "Invalid use of Timer");
 /**
diff --git a/include/flamegpu/flamegpu.h b/include/flamegpu/flamegpu.h
index 631781265..b007810fd 100644
--- a/include/flamegpu/flamegpu.h
+++ b/include/flamegpu/flamegpu.h
@@ -15,7 +15,7 @@
 // include all host API classes (top level header from each module)
 #include "flamegpu/version.h"
 #include "flamegpu/runtime/HostAPI.h"
-#include "flamegpu/runtime/HostAgentAPI.cuh"
+#include "flamegpu/runtime/agent/HostAgentAPI.cuh"
 #include "flamegpu/runtime/DeviceAPI.cuh"
 #include "flamegpu/model/ModelDescription.h"
 #include "flamegpu/model/AgentDescription.h"
@@ -26,17 +26,17 @@
 #include "flamegpu/model/SubModelDescription.h"
 #include "flamegpu/model/SubAgentDescription.h"
 #include "flamegpu/model/SubEnvironmentDescription.h"
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/pop/AgentInstance.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/runtime/agent/AgentInstance.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/runtime/messaging.h"
 #include "flamegpu/runtime/AgentFunction_shim.cuh"
 #include "flamegpu/runtime/AgentFunctionCondition_shim.cuh"
-#include "flamegpu/gpu/CUDAEnsemble.h"
-#include "flamegpu/sim/RunPlanVector.h"
-#include "flamegpu/sim/LoggingConfig.h"
-#include "flamegpu/sim/AgentLoggingConfig.h"
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/CUDAEnsemble.h"
+#include "flamegpu/simulation/RunPlanVector.h"
+#include "flamegpu/simulation/LoggingConfig.h"
+#include "flamegpu/simulation/AgentLoggingConfig.h"
+#include "flamegpu/simulation/LogFrame.h"
 #include "flamegpu/util/cleanup.h"
 #include "flamegpu/io/Telemetry.h"
 
diff --git a/include/flamegpu/io/JSONLogger.h b/include/flamegpu/io/JSONLogger.h
index 54b277587..13e8edf4a 100644
--- a/include/flamegpu/io/JSONLogger.h
+++ b/include/flamegpu/io/JSONLogger.h
@@ -5,7 +5,7 @@
 #include <typeindex>
 
 #include "flamegpu/io/Logger.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/detail/Any.h"
 
 namespace flamegpu {
 struct RunLog;
@@ -129,7 +129,7 @@ class JSONLogger : public Logger{
      * @note Templated as can't forward declare rapidjson::Writer<rapidjson::StringBuffer>
      */
     template<typename T>
-    void writeAny(T &writer, const util::Any &value, unsigned int elements = 1) const;
+    void writeAny(T &writer, const detail::Any &value, unsigned int elements = 1) const;
 
     std::string out_path;
     bool prettyPrint;
diff --git a/include/flamegpu/io/JSONStateReader.h b/include/flamegpu/io/JSONStateReader.h
index e2f0e0a9a..52890833e 100644
--- a/include/flamegpu/io/JSONStateReader.h
+++ b/include/flamegpu/io/JSONStateReader.h
@@ -32,7 +32,7 @@ class JSONStateReader : public StateReader {
     JSONStateReader(
         const std::string &model_name,
         const std::unordered_map<std::string, EnvironmentData::PropData> &env_desc,
-        std::unordered_map<std::string, util::Any>&env_init,
+        std::unordered_map<std::string, detail::Any>&env_init,
         util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
         const std::string &input_file,
         Simulation *sim_instance);
diff --git a/include/flamegpu/io/JSONStateWriter.h b/include/flamegpu/io/JSONStateWriter.h
index ce7aa28d3..773ee71eb 100644
--- a/include/flamegpu/io/JSONStateWriter.h
+++ b/include/flamegpu/io/JSONStateWriter.h
@@ -28,7 +28,7 @@ class JSONStateWriter : public StateWriter {
      */
     JSONStateWriter(
         const std::string &model_name,
-        const std::shared_ptr<EnvironmentManager>& env_manager,
+        const std::shared_ptr<detail::EnvironmentManager>& env_manager,
         const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
         unsigned int iterations,
         const std::string &output_file,
diff --git a/include/flamegpu/io/StateReader.h b/include/flamegpu/io/StateReader.h
index 3cc4c86cc..14e1d1c76 100644
--- a/include/flamegpu/io/StateReader.h
+++ b/include/flamegpu/io/StateReader.h
@@ -8,7 +8,7 @@
 
 #include "flamegpu/util/StringPair.h"
 #include "flamegpu/model/EnvironmentData.h"
-#include "flamegpu/sim/Simulation.h"
+#include "flamegpu/simulation/Simulation.h"
 
 namespace flamegpu {
 
@@ -37,7 +37,7 @@ class StateReader {
     StateReader(
         const std::string& _model_name,
         const std::unordered_map<std::string, EnvironmentData::PropData>& _env_desc,
-        std::unordered_map<std::string, util::Any>& _env_init,
+        std::unordered_map<std::string, detail::Any>& _env_init,
         util::StringPairUnorderedMap<std::shared_ptr<AgentVector>>& _model_state,
         const std::string& input,
         Simulation* _sim_instance)
@@ -67,7 +67,7 @@ class StateReader {
     std::string inputFile;
     const std::string model_name;
     const std::unordered_map<std::string, EnvironmentData::PropData> &env_desc;
-    std::unordered_map<std::string, util::Any>& env_init;
+    std::unordered_map<std::string, detail::Any>& env_init;
     Simulation *sim_instance;
 };
 }  // namespace io
diff --git a/include/flamegpu/io/StateReaderFactory.h b/include/flamegpu/io/StateReaderFactory.h
index 9fa9e30ce..208cbf8a5 100644
--- a/include/flamegpu/io/StateReaderFactory.h
+++ b/include/flamegpu/io/StateReaderFactory.h
@@ -38,7 +38,7 @@ class StateReaderFactory {
     static StateReader* createReader(
         const std::string& model_name,
         const std::unordered_map<std::string, EnvironmentData::PropData>& env_desc,
-        std::unordered_map<std::string, util::Any>& env_init,
+        std::unordered_map<std::string, detail::Any>& env_init,
         util::StringPairUnorderedMap<std::shared_ptr<AgentVector>>& model_state,
         const std::string& input,
         Simulation* sim_instance) {
diff --git a/include/flamegpu/io/StateWriter.h b/include/flamegpu/io/StateWriter.h
index f166bea6e..b626fa0ce 100644
--- a/include/flamegpu/io/StateWriter.h
+++ b/include/flamegpu/io/StateWriter.h
@@ -10,9 +10,10 @@
 #include "flamegpu/util/StringPair.h"
 
 namespace flamegpu {
-
-class AgentVector;
+namespace detail {
 class EnvironmentManager;
+}  // namespace detail
+class AgentVector;
 class Simulation;
 
 namespace io {
@@ -35,7 +36,7 @@ class StateWriter {
      * @param _sim_instance Instance of the simulation (for configuration data IO)
      */
     StateWriter(const std::string &_model_name,
-        const std::shared_ptr<EnvironmentManager>& _env_manager,
+        const std::shared_ptr<detail::EnvironmentManager>& _env_manager,
         const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &_model_state,
         const unsigned int _iterations,
         const std::string &output_file,
@@ -67,7 +68,7 @@ class StateWriter {
     unsigned int iterations;
     std::string outputFile;
     const std::string model_name;
-    const std::shared_ptr<EnvironmentManager> env_manager;
+    const std::shared_ptr<detail::EnvironmentManager> env_manager;
     const Simulation *sim_instance;
 };
 }  // namespace io
diff --git a/include/flamegpu/io/StateWriterFactory.h b/include/flamegpu/io/StateWriterFactory.h
index 222626961..8ca00afe7 100644
--- a/include/flamegpu/io/StateWriterFactory.h
+++ b/include/flamegpu/io/StateWriterFactory.h
@@ -39,7 +39,7 @@ class StateWriterFactory {
      */
     static StateWriter* createWriter(
         const std::string& model_name,
-        const std::shared_ptr<EnvironmentManager>& env_manager,
+        const std::shared_ptr<detail::EnvironmentManager>& env_manager,
         const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>>& model_state,
         const unsigned int iterations,
         const std::string& output_file,
diff --git a/include/flamegpu/io/XMLLogger.h b/include/flamegpu/io/XMLLogger.h
index 583bcc4a8..804966bf1 100644
--- a/include/flamegpu/io/XMLLogger.h
+++ b/include/flamegpu/io/XMLLogger.h
@@ -5,7 +5,7 @@
 #include <typeindex>
 
 #include "flamegpu/io/Logger.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/detail/Any.h"
 
 namespace tinyxml2 {
 class XMLNode;
@@ -113,7 +113,7 @@ class XMLLogger : public Logger{
      * @tparam T Instance of rapidjson::Writer or subclass (e.g. rapidjson::PrettyWriter)
      * @note Templated as can't forward declare rapidjson::Writer<rapidjson::StringBuffer>
      */
-    void writeAny(tinyxml2::XMLElement *element, const util::Any &value, unsigned int elements = 1) const;
+    void writeAny(tinyxml2::XMLElement *element, const detail::Any &value, unsigned int elements = 1) const;
 
     std::string out_path;
     bool prettyPrint;
diff --git a/include/flamegpu/io/XMLStateReader.h b/include/flamegpu/io/XMLStateReader.h
index e016046b7..af1f883a6 100644
--- a/include/flamegpu/io/XMLStateReader.h
+++ b/include/flamegpu/io/XMLStateReader.h
@@ -31,7 +31,7 @@ class XMLStateReader : public StateReader {
     XMLStateReader(
         const std::string &model_name,
         const std::unordered_map<std::string, EnvironmentData::PropData> &env_desc,
-        std::unordered_map<std::string, util::Any> &env_init,
+        std::unordered_map<std::string, detail::Any> &env_init,
         util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
         const std::string &input_file,
         Simulation *sim_instance);
diff --git a/include/flamegpu/io/XMLStateWriter.h b/include/flamegpu/io/XMLStateWriter.h
index 6c549cb2d..b75607283 100644
--- a/include/flamegpu/io/XMLStateWriter.h
+++ b/include/flamegpu/io/XMLStateWriter.h
@@ -28,7 +28,7 @@ class XMLStateWriter : public StateWriter {
      */
     XMLStateWriter(
         const std::string &model_name,
-        const std::shared_ptr<EnvironmentManager>& env_manager,
+        const std::shared_ptr<detail::EnvironmentManager>& env_manager,
         const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
         unsigned int iterations,
         const std::string &output_file,
diff --git a/include/flamegpu/model/AgentDescription.h b/include/flamegpu/model/AgentDescription.h
index ec7c3cecb..5f40eb91b 100644
--- a/include/flamegpu/model/AgentDescription.h
+++ b/include/flamegpu/model/AgentDescription.h
@@ -10,10 +10,10 @@
 
 #include "flamegpu/model/Variable.h"
 #include "flamegpu/model/ModelDescription.h"
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/pop/AgentInstance.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/runtime/agent/AgentInstance.h"
 #include "flamegpu/model/AgentData.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/detail/type_decode.h"
 
 namespace flamegpu {
 
@@ -331,10 +331,10 @@ void AgentDescription::newVariable(const std::string &variable_name, const std::
             "in AgentDescription::newVariable().");
     }
     // Array length 0 makes no sense
-    static_assert(type_decode<T>::len_t * N > 0, "A variable cannot have 0 elements.");
+    static_assert(detail::type_decode<T>::len_t * N > 0, "A variable cannot have 0 elements.");
     if (agent->variables.find(variable_name) == agent->variables.end()) {
-        const std::array<typename type_decode<T>::type_t, type_decode<T>::len_t * N> *casted_default =
-        reinterpret_cast<const std::array<typename type_decode<T>::type_t, type_decode<T>::len_t* N>*>(&default_value);
+        const std::array<typename detail::type_decode<T>::type_t, detail::type_decode<T>::len_t * N> *casted_default =
+        reinterpret_cast<const std::array<typename detail::type_decode<T>::type_t, detail::type_decode<T>::len_t* N>*>(&default_value);
         agent->variables.emplace(variable_name, Variable(*casted_default));
         return;
     }
@@ -370,11 +370,11 @@ void AgentDescription::newVariableArray(const std::string& variable_name, const
             length, static_cast<unsigned int>(default_value.size()));
     }
     if (agent->variables.find(variable_name) == agent->variables.end()) {
-        std::vector<typename type_decode<T>::type_t> temp(static_cast<size_t>(type_decode<T>::len_t * length));
+        std::vector<typename detail::type_decode<T>::type_t> temp(static_cast<size_t>(detail::type_decode<T>::len_t * length));
         if (default_value.size()) {
-            memcpy(temp.data(), default_value.data(), sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t * length);
+            memcpy(temp.data(), default_value.data(), sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t * length);
         }
-        agent->variables.emplace(variable_name, Variable(type_decode<T>::len_t* length, temp));
+        agent->variables.emplace(variable_name, Variable(detail::type_decode<T>::len_t* length, temp));
         return;
     }
     THROW exception::InvalidAgentVar("Agent ('%s') already contains variable '%s', "
diff --git a/include/flamegpu/model/EnvironmentData.h b/include/flamegpu/model/EnvironmentData.h
index 9f1af0c42..5e96699fa 100644
--- a/include/flamegpu/model/EnvironmentData.h
+++ b/include/flamegpu/model/EnvironmentData.h
@@ -8,7 +8,7 @@
 #include <typeindex>
 #include <unordered_map>
 
-#include "flamegpu/util/Any.h"
+#include "flamegpu/detail/Any.h"
 #include "flamegpu/model/ModelData.h"
 
 namespace flamegpu {
@@ -25,7 +25,7 @@ struct EnvironmentData {
      */
     // friend class CUDASimulation;
 
-    // friend class SimRunner;
+    // friend class detail::SimRunner;
     // friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);
     /**
      * Holds all of the properties required to add a value to EnvironmentManager
@@ -35,11 +35,11 @@ struct EnvironmentData {
          * @param _is_const Is the property constant
          * @param _data The data to initially fill the property with
          */
-        PropData(bool _is_const, const util::Any& _data)
+        PropData(bool _is_const, const detail::Any& _data)
             : isConst(_is_const)
             , data(_data) { }
         bool isConst;
-        const util::Any data;
+        const detail::Any data;
         bool operator==(const PropData& rhs) const {
             if (this == &rhs)
                 return true;
diff --git a/include/flamegpu/model/EnvironmentDescription.h b/include/flamegpu/model/EnvironmentDescription.h
index e943011be..31436ec12 100644
--- a/include/flamegpu/model/EnvironmentDescription.h
+++ b/include/flamegpu/model/EnvironmentDescription.h
@@ -9,11 +9,11 @@
 #include <memory>
 
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/runtime/utility/HostEnvironment.cuh"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/runtime/environment/HostEnvironment.cuh"
+#include "flamegpu/detail/Any.h"
 #include "flamegpu/model/EnvironmentData.h"
-#include "flamegpu/util/type_decode.h"
-#include "flamegpu/gpu/CUDAEnsemble.h"
+#include "flamegpu/detail/type_decode.h"
+#include "flamegpu/simulation/CUDAEnsemble.h"
 
 namespace flamegpu {
 
@@ -272,14 +272,14 @@ void EnvironmentDescription::newProperty(const std::string &name, T value, bool
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     if (environment->properties.find(name) != environment->properties.end()) {
         THROW exception::DuplicateEnvProperty("Environmental property with name '%s' already exists, "
             "in EnvironmentDescription::newProperty().",
             name.c_str());
     }
-    newProperty(name, reinterpret_cast<const char*>(&value), sizeof(T), isConst, type_decode<T>::len_t, typeid(typename type_decode<T>::type_t));
+    newProperty(name, reinterpret_cast<const char*>(&value), sizeof(T), isConst, detail::type_decode<T>::len_t, typeid(typename detail::type_decode<T>::type_t));
 }
 template<typename T, flamegpu::size_type N>
 void EnvironmentDescription::newProperty(const std::string &name, const std::array<T, N> &value, bool isConst) {
@@ -287,17 +287,17 @@ void EnvironmentDescription::newProperty(const std::string &name, const std::arr
         THROW exception::ReservedName("Environment property names cannot begin with '_', this is reserved for internal usage, "
             "in EnvironmentDescription::newProperty().");
     }
-    static_assert(type_decode<T>::len_t * N > 0, "Environment property arrays must have a length greater than 0.");
+    static_assert(detail::type_decode<T>::len_t * N > 0, "Environment property arrays must have a length greater than 0.");
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     if (environment->properties.find(name) != environment->properties.end()) {
         THROW exception::DuplicateEnvProperty("Environmental property with name '%s' already exists, "
             "in EnvironmentDescription::newProperty().",
             name.c_str());
     }
-    newProperty(name, reinterpret_cast<const char*>(value.data()), N * sizeof(T), isConst, type_decode<T>::len_t * N, typeid(typename type_decode<T>::type_t));
+    newProperty(name, reinterpret_cast<const char*>(value.data()), N * sizeof(T), isConst, detail::type_decode<T>::len_t * N, typeid(typename detail::type_decode<T>::type_t));
 }
 #ifdef SWIG
 template<typename T>
@@ -312,14 +312,14 @@ void EnvironmentDescription::newPropertyArray(const std::string &name, const std
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     if (environment->properties.find(name) != environment->properties.end()) {
         THROW exception::DuplicateEnvProperty("Environmental property with name '%s' already exists, "
             "in EnvironmentDescription::newPropertyArray().",
             name.c_str());
     }
-    newProperty(name, reinterpret_cast<const char*>(value.data()), value.size() * sizeof(T), isConst, type_decode<T>::len_t * value.size(), typeid(typename type_decode<T>::type_t));
+    newProperty(name, reinterpret_cast<const char*>(value.data()), value.size() * sizeof(T), isConst, detail::type_decode<T>::len_t * value.size(), typeid(typename detail::type_decode<T>::type_t));
 }
 #endif
 /**
@@ -329,19 +329,19 @@ template<typename T>
 T CEnvironmentDescription::getProperty(const std::string &name) const {
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::getProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements != type_decode<T>::len_t) {
+        if (i->second.data.elements != detail::type_decode<T>::len_t) {
             THROW exception::InvalidEnvPropertyType("Length of named environmental property (%u) does not match vector length (%u), "
                 "in EnvironmentDescription::getProperty().",
-                i->second.data.elements, type_decode<T>::len_t);
+                i->second.data.elements, detail::type_decode<T>::len_t);
         }
         return *reinterpret_cast<T*>(i->second.data.ptr);
     }
@@ -353,19 +353,19 @@ template<typename T, flamegpu::size_type N>
 std::array<T, N> CEnvironmentDescription::getProperty(const std::string &name) const {
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::getProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements != type_decode<T>::len_t * N) {
+        if (i->second.data.elements != detail::type_decode<T>::len_t * N) {
             THROW exception::InvalidEnvPropertyType("Length of named environmental property array (%u) does not match requested length (%u), "
                 "in EnvironmentDescription::getProperty().",
-                i->second.data.elements, type_decode<T>::len_t * N);
+                i->second.data.elements, detail::type_decode<T>::len_t * N);
         }
         // Copy old data to return
         std::array<T, N> rtn;
@@ -380,25 +380,25 @@ template<typename T>
 T CEnvironmentDescription::getProperty(const std::string &name, flamegpu::size_type index) const {
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::getProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements % type_decode<T>::len_t != 0) {
+        if (i->second.data.elements % detail::type_decode<T>::len_t != 0) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') length (%u) does not divide by vector length (%u), "
                 "in EnvironmentDescription::getPropertyArray().",
-                name.c_str(), i->second.data.elements, type_decode<T>::len_t);
+                name.c_str(), i->second.data.elements, detail::type_decode<T>::len_t);
         }
-        const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+        const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
         if (i->second.data.elements < t_index || t_index < index) {
             THROW exception::OutOfBoundsException("Index (%u) exceeds named environmental property array's length (%u), "
                 "in EnvironmentDescription::getProperty().",
-                index, i->second.data.elements / type_decode<T>::len_t);
+                index, i->second.data.elements / detail::type_decode<T>::len_t);
         }
         // Copy old data to return
         return *(reinterpret_cast<T*>(i->second.data.ptr) + index);
@@ -412,23 +412,23 @@ template<typename T>
 std::vector<T> CEnvironmentDescription::getPropertyArray(const std::string& name) const {
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::getPropertyArray().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements % type_decode<T>::len_t != 0) {
+        if (i->second.data.elements % detail::type_decode<T>::len_t != 0) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') length (%u) does not divide by vector length (%d), "
                 "in EnvironmentDescription::getPropertyArray().",
-                name.c_str(), i->second.data.elements, type_decode<T>::len_t);
+                name.c_str(), i->second.data.elements, detail::type_decode<T>::len_t);
         }
         // Copy old data to return
-        std::vector<T> rtn(i->second.data.elements / type_decode<T>::len_t);
-        memcpy(rtn.data(), reinterpret_cast<T*>(i->second.data.ptr), i->second.data.elements * sizeof(typename type_decode<T>::type_t));
+        std::vector<T> rtn(i->second.data.elements / detail::type_decode<T>::len_t);
+        memcpy(rtn.data(), reinterpret_cast<T*>(i->second.data.ptr), i->second.data.elements * sizeof(typename detail::type_decode<T>::type_t));
         return rtn;
     }
     THROW exception::InvalidEnvProperty("Environmental property with name '%s' does not exist, "
@@ -448,19 +448,19 @@ T EnvironmentDescription::setProperty(const std::string &name, T value) {
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::setProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements != type_decode<T>::len_t) {
+        if (i->second.data.elements != detail::type_decode<T>::len_t) {
             THROW exception::InvalidEnvPropertyType("Length of named environmental property (%u) does not match vector length (%u), "
                 "in EnvironmentDescription::setProperty().",
-                i->second.data.elements, type_decode<T>::len_t);
+                i->second.data.elements, detail::type_decode<T>::len_t);
         }
         // Copy old data to return
         T rtn = *reinterpret_cast<T*>(i->second.data.ptr);
@@ -480,19 +480,19 @@ std::array<T, N> EnvironmentDescription::setProperty(const std::string &name, co
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::setProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements != N * type_decode<T>::len_t) {
+        if (i->second.data.elements != N * detail::type_decode<T>::len_t) {
             THROW exception::InvalidEnvPropertyType("Length of named environmental property array (%u) does not match requested length (%u), "
                 "in EnvironmentDescription::setProperty().",
-                i->second.data.elements, N * type_decode<T>::len_t);
+                i->second.data.elements, N * detail::type_decode<T>::len_t);
         }
         // Copy old data to return
         std::array<T, N> rtn;
@@ -513,25 +513,25 @@ T EnvironmentDescription::setProperty(const std::string &name, flamegpu::size_ty
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::setProperty().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements % type_decode<T>::len_t != 0) {
+        if (i->second.data.elements % detail::type_decode<T>::len_t != 0) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') length (%u) does not divide by vector length (%u), "
                 "in EnvironmentDescription::setProperty().",
-                name.c_str(), i->second.data.elements, type_decode<T>::len_t);
+                name.c_str(), i->second.data.elements, detail::type_decode<T>::len_t);
         }
-        const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+        const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
         if (i->second.data.elements < t_index || t_index < index) {
             THROW exception::OutOfBoundsException("Index (%u) exceeds named environmental property array's length (%u), "
                 "in EnvironmentDescription::setProperty().",
-                index, i->second.data.elements / type_decode<T>::len_t);
+                index, i->second.data.elements / detail::type_decode<T>::len_t);
         }
         // Copy old data to return
         T rtn = *(reinterpret_cast<T*>(i->second.data.ptr) +  index);
@@ -552,30 +552,30 @@ std::vector<T> EnvironmentDescription::setPropertyArray(const std::string& name,
     }
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     auto &&i = environment->properties.find(name);
     if (i != environment->properties.end()) {
-        if (i->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (i->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') type (%s) does not match template argument T (%s), "
                 "in EnvironmentDescription::setPropertyArray().",
-                name.c_str(), i->second.data.type.name(), typeid(typename type_decode<T>::type_t).name());
+                name.c_str(), i->second.data.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
         }
-        if (i->second.data.elements % type_decode<T>::len_t != 0) {
+        if (i->second.data.elements % detail::type_decode<T>::len_t != 0) {
             THROW exception::InvalidEnvPropertyType("Environmental property array ('%s') length (%u) does not divide by vector length (%u), "
                 "in EnvironmentDescription::setPropertyArray().",
-                name.c_str(), i->second.data.elements, type_decode<T>::len_t);
+                name.c_str(), i->second.data.elements, detail::type_decode<T>::len_t);
         }
-        if (i->second.data.elements != value.size() * type_decode<T>::len_t) {
+        if (i->second.data.elements != value.size() * detail::type_decode<T>::len_t) {
             THROW exception::OutOfBoundsException("Length of named environmental property array (%u) does not match length of provided vector (%llu), "
                 "in EnvironmentDescription::setPropertyArray().",
-                i->second.data.elements / type_decode<T>::len_t, value.size());
+                i->second.data.elements / detail::type_decode<T>::len_t, value.size());
         }
         // Copy old data to return
-        std::vector<T> rtn(i->second.data.elements / type_decode<T>::len_t);
-        memcpy(rtn.data(), reinterpret_cast<T*>(i->second.data.ptr), i->second.data.elements * sizeof(typename type_decode<T>::type_t));
+        std::vector<T> rtn(i->second.data.elements / detail::type_decode<T>::len_t);
+        memcpy(rtn.data(), reinterpret_cast<T*>(i->second.data.ptr), i->second.data.elements * sizeof(typename detail::type_decode<T>::type_t));
         // Store data
-        memcpy(reinterpret_cast<T*>(i->second.data.ptr), value.data(), i->second.data.elements * sizeof(typename type_decode<T>::type_t));
+        memcpy(reinterpret_cast<T*>(i->second.data.ptr), value.data(), i->second.data.elements * sizeof(typename detail::type_decode<T>::type_t));
         return rtn;
     }
     THROW exception::InvalidEnvProperty("Environmental property with name '%s' does not exist, "
diff --git a/include/flamegpu/model/Variable.h b/include/flamegpu/model/Variable.h
index fa79a093d..76f53d969 100644
--- a/include/flamegpu/model/Variable.h
+++ b/include/flamegpu/model/Variable.h
@@ -9,7 +9,7 @@
 #include <cstring>
 #include <vector>
 
-#include "flamegpu/pop/detail/MemoryVector.h"
+#include "flamegpu/simulation/detail/MemoryVector.h"
 
 namespace flamegpu {
 
diff --git a/include/flamegpu/pop/DeviceAgentVector.h b/include/flamegpu/pop/DeviceAgentVector.h
deleted file mode 100644
index 9a08baaa1..000000000
--- a/include/flamegpu/pop/DeviceAgentVector.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_H_
-#define INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_H_
-
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/pop/DeviceAgentVector_impl.h"
-
-namespace flamegpu {
-
-/**
- * This acts as a reference to DeviceAgentVector_impl
- * That class cannot be copied or assigned so it is accessed via a reference wrapper
- *
- * @see DeviceAgentVector_impl
- */
-typedef DeviceAgentVector_impl& DeviceAgentVector;
-
-}  // namespace flamegpu
-
-#endif  // INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_H_
diff --git a/include/flamegpu/runtime/AgentFunction.cuh b/include/flamegpu/runtime/AgentFunction.cuh
index 7dd10c321..ca9e2271d 100644
--- a/include/flamegpu/runtime/AgentFunction.cuh
+++ b/include/flamegpu/runtime/AgentFunction.cuh
@@ -4,7 +4,7 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 
-#include "flamegpu/util/detail/curand.cuh"
+#include "flamegpu/detail/curand.cuh"
 #include "flamegpu/runtime/detail/SharedBlock.h"
 #include "flamegpu/defines.h"
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
@@ -30,7 +30,7 @@ typedef void(AgentFunctionWrapper)(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    util::detail::curandState *d_rng,
+    detail::curandState *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput);  // Can't put __global__ in a typedef
@@ -66,7 +66,7 @@ __global__ void agent_function_wrapper(
     const unsigned int popNo,
     const void *in_messagelist_metadata,
     const void *out_messagelist_metadata,
-    util::detail::curandState *d_rng,
+    detail::curandState *d_rng,
     unsigned int *scanFlag_agentDeath,
     unsigned int *scanFlag_messageOutput,
     unsigned int *scanFlag_agentOutput) {
diff --git a/include/flamegpu/runtime/AgentFunctionCondition.cuh b/include/flamegpu/runtime/AgentFunctionCondition.cuh
index b100d9123..88c3c97df 100644
--- a/include/flamegpu/runtime/AgentFunctionCondition.cuh
+++ b/include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -22,7 +22,7 @@ typedef void(AgentFunctionConditionWrapper)(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    util::detail::curandState *d_rng,
+    detail::curandState *d_rng,
     unsigned int *scanFlag_conditionResult);  // Can't put __global__ in a typedef
 
 /**
@@ -47,7 +47,7 @@ __global__ void agent_function_condition_wrapper(
     const char* d_env_buffer,
 #endif
     const unsigned int popNo,
-    util::detail::curandState *d_rng,
+    detail::curandState *d_rng,
     unsigned int *scanFlag_conditionResult) {
     // We place these at the start of shared memory, so we can locate it anywhere in device code without a reference
     using detail::sm;
diff --git a/include/flamegpu/runtime/DeviceAPI.cuh b/include/flamegpu/runtime/DeviceAPI.cuh
index ea3edb5b0..f7782f8b2 100644
--- a/include/flamegpu/runtime/DeviceAPI.cuh
+++ b/include/flamegpu/runtime/DeviceAPI.cuh
@@ -12,8 +12,8 @@
 #else
 #include "dynamic/curve_rtc_dynamic.h"
 #endif  // !_RTC
-#include "flamegpu/runtime/utility/AgentRandom.cuh"
-#include "flamegpu/runtime/utility/DeviceEnvironment.cuh"
+#include "flamegpu/runtime/random/AgentRandom.cuh"
+#include "flamegpu/runtime/environment/DeviceEnvironment.cuh"
 #include "flamegpu/runtime/AgentFunction.cuh"
 #include "flamegpu/runtime/AgentFunctionCondition.cuh"
 #include "flamegpu/defines.h"
@@ -47,14 +47,14 @@ class ReadOnlyDeviceAPI {
         const detail::curve::CurveTable *,
 #endif
         const unsigned int,
-        util::detail::curandState *,
+        detail::curandState *,
         unsigned int *);
 
  public:
     /**
      * @param d_rng Pointer to the device random state buffer to be used
      */
-    __device__ ReadOnlyDeviceAPI(util::detail::curandState *&d_rng)
+    __device__ ReadOnlyDeviceAPI(detail::curandState *&d_rng)
         : random(AgentRandom(&d_rng[getIndex()]))
         , environment(DeviceEnvironment()) { }
     /**
@@ -155,7 +155,7 @@ class DeviceAPI {
         const unsigned int,
         const void *,
         const void *,
-        util::detail::curandState *,
+        detail::curandState *,
         unsigned int *,
         unsigned int *,
         unsigned int *);
@@ -239,7 +239,7 @@ class DeviceAPI {
      */
     __device__ DeviceAPI(
         id_t *&d_agent_output_nextID,
-        util::detail::curandState *&d_rng,
+        detail::curandState *&d_rng,
         unsigned int *&scanFlag_agentOutput,
         typename MessageIn::In &&message_in,
         typename MessageOut::Out &&message_out)
diff --git a/include/flamegpu/runtime/HostAPI.h b/include/flamegpu/runtime/HostAPI.h
index 298e29aa2..b4b8db5a5 100644
--- a/include/flamegpu/runtime/HostAPI.h
+++ b/include/flamegpu/runtime/HostAPI.h
@@ -9,19 +9,21 @@
 #include <vector>
 #include <memory>
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/runtime/utility/HostRandom.cuh"
-#include "flamegpu/runtime/utility/HostEnvironment.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/runtime/random/HostRandom.cuh"
+#include "flamegpu/runtime/environment/HostEnvironment.cuh"
 #include "flamegpu/runtime/HostAPI_macros.h"
-#include "flamegpu/runtime/HostNewAgentAPI.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/runtime/agent/HostNewAgentAPI.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
-
+namespace detail {
+class CUDAFatAgent;
 class CUDAScatter;
+class CUDAMacroEnvironment;
+}  // namespace detail
 class CUDASimulation;
 class HostAgentAPI;
-class CUDAMacroEnvironment;
 
 /**
  * @brief    A flame gpu api class for use by host functions only
@@ -36,7 +38,7 @@ class HostAPI {
     /**
      * CUDAFatAgent::assignIDs() makes use of resizeTempStorage()
      */
-    friend class CUDAFatAgent;
+    friend class detail::CUDAFatAgent;
 
  public:
     // Typedefs repeated from CUDASimulation
@@ -50,14 +52,14 @@ class HostAPI {
      * Stores reference of CUDASimulation
      */
      explicit HostAPI(CUDASimulation&_agentModel,
-          RandomManager &rng,
-          CUDAScatter &scatter,
-          const AgentOffsetMap &agentOffsets,
-          AgentDataMap &agentData,
-          const std::shared_ptr<EnvironmentManager> &env,
-          CUDAMacroEnvironment &macro_env,
-          unsigned int streamId,
-         cudaStream_t stream);
+        detail::RandomManager &rng,
+        detail::CUDAScatter &scatter,
+        const AgentOffsetMap &agentOffsets,
+        AgentDataMap &agentData,
+        const std::shared_ptr<detail::EnvironmentManager> &env,
+        detail::CUDAMacroEnvironment &macro_env,
+        unsigned int streamId,
+        cudaStream_t stream);
     /**
      * Frees held device memory
      */
@@ -101,7 +103,7 @@ class HostAPI {
     /**
      * Cuda scatter singleton
      */
-    CUDAScatter &scatter;
+    detail::CUDAScatter &scatter;
     /**
      * Stream index for stream-specific resources
      */
@@ -116,7 +118,7 @@ template<typename T>
 void HostAPI::resizeOutputSpace(const unsigned int items) {
     if (sizeof(T) * items > d_output_space_size) {
         if (d_output_space_size) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_output_space));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_output_space));
         }
         gpuErrchk(cudaMalloc(&d_output_space, sizeof(T) * items));
         d_output_space_size = sizeof(T) * items;
diff --git a/include/flamegpu/runtime/HostAgentAPI.cuh b/include/flamegpu/runtime/HostAgentAPI.cuh
index 73a339636..fda95dd89 100644
--- a/include/flamegpu/runtime/HostAgentAPI.cuh
+++ b/include/flamegpu/runtime/HostAgentAPI.cuh
@@ -20,16 +20,16 @@
 #include <memory>
 #include <utility>
 
-#include "flamegpu/sim/AgentInterface.h"
+#include "flamegpu/simulation/detail/AgentInterface.h"
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/runtime/HostAPI.h"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/gpu/CUDAAgent.h"
-#include "flamegpu/pop/DeviceAgentVector.h"
-#include "flamegpu/pop/DeviceAgentVector_impl.h"
-#include "flamegpu/sim/AgentLoggingConfig_Reductions.cuh"
-#include "flamegpu/sim/AgentLoggingConfig_SumReturn.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+#include "flamegpu/simulation/AgentLoggingConfig_Reductions.cuh"
+#include "flamegpu/simulation/AgentLoggingConfig_SumReturn.h"
+#include "flamegpu/detail/type_decode.h"
 
 namespace flamegpu {
 
diff --git a/include/flamegpu/pop/AgentInstance.h b/include/flamegpu/runtime/agent/AgentInstance.h
similarity index 80%
rename from include/flamegpu/pop/AgentInstance.h
rename to include/flamegpu/runtime/agent/AgentInstance.h
index 1807573e6..a963771c9 100644
--- a/include/flamegpu/pop/AgentInstance.h
+++ b/include/flamegpu/runtime/agent/AgentInstance.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_POP_AGENTINSTANCE_H_
-#define INCLUDE_FLAMEGPU_POP_AGENTINSTANCE_H_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_AGENTINSTANCE_H_
+#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_AGENTINSTANCE_H_
 
 #include <memory>
 #include <map>
@@ -8,8 +8,8 @@
 
 
 #include "flamegpu/model/AgentData.h"
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/detail/Any.h"
 
 namespace flamegpu {
 
@@ -77,7 +77,7 @@ class AgentInstance {
 #endif
 
  private:
-    std::map<std::string, util::Any> _data;
+    std::map<std::string, detail::Any> _data;
     std::shared_ptr<const AgentData> _agent;
 };
 
@@ -91,16 +91,16 @@ T AgentInstance::getVariable(const std::string& variable_name) const {
             variable_name.c_str());
     }
     const auto& v_buff = v_it->second;
-    if (v_buff.elements != type_decode<T>::len_t) {
+    if (v_buff.elements != detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' is an array variable, use the array method instead, "
             "in AgentInstance::getVariable().",
             variable_name.c_str());
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::getVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     return *static_cast<const T*>(v_buff.ptr);
 }
@@ -113,16 +113,16 @@ std::array<T, N> AgentInstance::getVariable(const std::string& variable_name) co
             variable_name.c_str());
     }
     const auto& v_buff = v_it->second;
-    if (v_buff.elements != N * type_decode<T>::len_t) {
+    if (v_buff.elements != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentInstance::getVariable().",
-            variable_name.c_str(), v_buff.elements / type_decode<T>::len_t, N);
+            variable_name.c_str(), v_buff.elements / detail::type_decode<T>::len_t, N);
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::getVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     std::array<T, N> rtn;
     memcpy(rtn.data(), v_buff.ptr, sizeof(T) * N);
@@ -142,22 +142,22 @@ T AgentInstance::getVariable(const std::string& variable_name, const unsigned in
             "in AgentInstance::getVariable()\n",
             variable_name.c_str(), N, v_buff.elements);
     }
-    if (v_buff.elements % type_decode<T>::len_t != 0) {
+    if (v_buff.elements % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not visible by vector length (%u),  "
             "in AgentInstance::getVariable().",
-            v_buff.elements, type_decode<T>::len_t, variable_name.c_str());
+            v_buff.elements, detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (t_index > v_buff.elements || t_index < index) {
         THROW exception::OutOfBoundsException("Index '%u' exceeds array bounds [0-%u) of variable '%s',  "
             "in AgentInstance::getVariable().",
             index, v_buff.elements, variable_name.c_str());
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::getVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     return static_cast<T*>(v_buff.ptr)[index];
 }
@@ -171,18 +171,18 @@ std::vector<T> AgentInstance::getVariableArray(const std::string& variable_name)
             variable_name.c_str());
     }
     const auto& v_buff = v_it->second;
-    if (v_buff.elements % type_decode<T>::len_t != 0) {
+    if (v_buff.elements % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not visible by vector length (%u),  "
             "in AgentInstance::getVariableArray().",
-            v_buff.elements, type_decode<T>::len_t, variable_name.c_str());
+            v_buff.elements, detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::getVariableArray().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
-    std::vector<T> rtn(static_cast<size_t>(v_buff.elements / type_decode<T>::len_t));
+    std::vector<T> rtn(static_cast<size_t>(v_buff.elements / detail::type_decode<T>::len_t));
     memcpy(rtn.data(), static_cast<T*>(v_buff.ptr), sizeof(T) * v_buff.elements);
     return rtn;
 }
@@ -196,16 +196,16 @@ void AgentInstance::setVariable(const std::string& variable_name, T value) {
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff.elements != type_decode<T>::len_t) {
+    if (v_buff.elements != detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' is an array variable, use the array method instead, "
             "in AgentInstance::setVariable().",
             variable_name.c_str());
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::setVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     // do the replace
     *static_cast<T*>(v_buff.ptr) = value;
@@ -219,16 +219,16 @@ void AgentInstance::setVariable(const std::string& variable_name, const std::arr
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff.elements != N * type_decode<T>::len_t) {
+    if (v_buff.elements != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentInstance::setVariable().",
             variable_name.c_str(), v_buff.elements, N);
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::setVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     memcpy(static_cast<T*>(v_buff.ptr), value.data(), sizeof(T) * N);
 }
@@ -246,22 +246,22 @@ void AgentInstance::setVariable(const std::string& variable_name, const unsigned
             "in AgentInstance::setVariable()\n",
             variable_name.c_str(), N, v_buff.elements);
     }
-    if (v_buff.elements % type_decode<T>::len_t != 0) {
+    if (v_buff.elements % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not visible by vector length (%u),  "
             "in AgentInstance::setVariable().",
-           v_buff.elements, type_decode<T>::len_t, variable_name.c_str());
+           v_buff.elements, detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (t_index > v_buff.elements || t_index < index) {
         THROW exception::OutOfBoundsException("Index '%u' exceeds array bounds [0-%u) of variable '%s',  "
             "in AgentInstance::setVariable().",
             index, v_buff.elements, variable_name.c_str());
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::setVariable().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     static_cast<T*>(v_buff.ptr)[index] = value;
 }
@@ -275,16 +275,16 @@ void AgentInstance::setVariableArray(const std::string& variable_name, const std
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff.elements != value.size() * type_decode<T>::len_t) {
+    if (v_buff.elements != value.size() * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentInstance::setVariableArray().",
-            variable_name.c_str(), v_buff.elements, value.size() * type_decode<T>::len_t);
+            variable_name.c_str(), v_buff.elements, value.size() * detail::type_decode<T>::len_t);
     }
-    if (v_buff.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentInstance::setVariableArray().",
-            variable_name.c_str(), v_buff.type.name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     memcpy(static_cast<T*>(v_buff.ptr), value.data(), sizeof(T) * v_buff.elements);
 }
@@ -292,4 +292,4 @@ void AgentInstance::setVariableArray(const std::string& variable_name, const std
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_AGENTINSTANCE_H_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_AGENTINSTANCE_H_
diff --git a/include/flamegpu/runtime/agent/DeviceAgentVector.h b/include/flamegpu/runtime/agent/DeviceAgentVector.h
new file mode 100644
index 000000000..e115caedb
--- /dev/null
+++ b/include/flamegpu/runtime/agent/DeviceAgentVector.h
@@ -0,0 +1,19 @@
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_H_
+#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_H_
+
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+
+namespace flamegpu {
+
+/**
+ * Alias for an lvalue reference to DeviceAgentVector_impl
+ * Copy operations are disabled for that class, so it is always handled via this reference type
+ * (e.g. it is the return type of HostAgentAPI::getPopulationData())
+ * @see DeviceAgentVector_impl
+ */
+typedef DeviceAgentVector_impl& DeviceAgentVector;
+
+}  // namespace flamegpu
+
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_H_
diff --git a/include/flamegpu/pop/DeviceAgentVector_impl.h b/include/flamegpu/runtime/agent/DeviceAgentVector_impl.h
similarity index 94%
rename from include/flamegpu/pop/DeviceAgentVector_impl.h
rename to include/flamegpu/runtime/agent/DeviceAgentVector_impl.h
index c2f82199e..91444530e 100644
--- a/include/flamegpu/pop/DeviceAgentVector_impl.h
+++ b/include/flamegpu/runtime/agent/DeviceAgentVector_impl.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_IMPL_H_
-#define INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_IMPL_H_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_IMPL_H_
+#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_IMPL_H_
 
 #include <string>
 #include <utility>
@@ -9,14 +9,14 @@
 #include <set>
 #include <vector>
 
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/gpu/CUDAFatAgentStateList.h"  // VariableBuffer
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/simulation/detail/CUDAFatAgentStateList.h"  // VariableBuffer
 
 namespace flamegpu {
-
+namespace detail {
 class CUDAScatter;
 class CUDAAgent;
-
+}  // namespace detail
 struct VarOffsetStruct;
 struct NewAgentStorage;
 
@@ -43,9 +43,9 @@ class DeviceAgentVector_impl : protected AgentVector {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    DeviceAgentVector_impl(CUDAAgent& _cuda_agent, const std::string& cuda_agent_state,
+    DeviceAgentVector_impl(detail::CUDAAgent& _cuda_agent, const std::string& cuda_agent_state,
         const VarOffsetStruct& _agentOffsets, std::vector<NewAgentStorage>& _newAgentData,
-        CUDAScatter& scatter, unsigned int streamId, cudaStream_t stream);
+        detail::CUDAScatter& scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Copy operations are disabled
      */
@@ -284,7 +284,7 @@ class DeviceAgentVector_impl : protected AgentVector {
          /**
           * Never allocate
          */
-         explicit VariableBufferPair(const std::shared_ptr<VariableBuffer>& _device)
+         explicit VariableBufferPair(const std::shared_ptr<detail::VariableBuffer>& _device)
              : device(_device) { }
          VariableBufferPair(VariableBufferPair&& other) {
              *this = std::move(other);
@@ -312,7 +312,7 @@ class DeviceAgentVector_impl : protected AgentVector {
          */
          char* host = nullptr;
          /**/
-         std::shared_ptr<VariableBuffer> device;
+         std::shared_ptr<detail::VariableBuffer> device;
      };
     /**
      * Any operations which move agents just be applied to this buffers too
@@ -352,18 +352,18 @@ class DeviceAgentVector_impl : protected AgentVector {
      * @param init If true, new memory is init
      */
     void resizeUnboundBuffers(unsigned int new_capacity, bool init);
-    CUDAAgent& cuda_agent;
+    detail::CUDAAgent& cuda_agent;
     std::string cuda_agent_state;
 
 
     const VarOffsetStruct& agentOffsets;
     std::vector<NewAgentStorage>& newAgentData;
 
-    CUDAScatter& scatter;
+    detail::CUDAScatter& scatter;
     const unsigned int streamId;
     const cudaStream_t stream;
 };
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_DEVICEAGENTVECTOR_IMPL_H_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_DEVICEAGENTVECTOR_IMPL_H_
diff --git a/include/flamegpu/runtime/agent/HostAgentAPI.cuh b/include/flamegpu/runtime/agent/HostAgentAPI.cuh
new file mode 100644
index 000000000..bcf853384
--- /dev/null
+++ b/include/flamegpu/runtime/agent/HostAgentAPI.cuh
@@ -0,0 +1,1017 @@
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_
+#ifdef _MSC_VER
+#pragma warning(push, 1)
+#pragma warning(disable : 4706 4834)
+#endif  // _MSC_VER
+#include <cub/cub.cuh>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif  // _MSC_VER
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "flamegpu/simulation/detail/AgentInterface.h"
+#include "flamegpu/model/AgentDescription.h"
+#include "flamegpu/runtime/HostAPI.h"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+#include "flamegpu/simulation/AgentLoggingConfig_Reductions.cuh"
+#include "flamegpu/simulation/AgentLoggingConfig_SumReturn.h"
+#include "flamegpu/detail/type_decode.h"
+
+namespace flamegpu {
+
+/**
+ * Macro for defining custom reduction functions with the correct inputs.
+ *
+ * (a, b)->(c)
+ *
+ * These functions must be valid CUDA code (they are declared __host__ __device__), and have no access to the FLAMEGPU DeviceAPI.
+ *
+ * Saves users from manually defining custom reductions, e.g.:
+ * @code{.cpp}
+ * // User Implemented custom reduction
+ * struct SomeCustomReduction_impl {
+ *  public:
+ *     template <typename OutT>
+ *     struct binary_function {
+ *         __host__ __device__ __forceinline__ OutT operator()(const OutT &a, const OutT &b) const {
+ *              // reduce something
+ *              return a + b;
+ *         }
+ *     };
+ * };
+ * SomeCustomReduction_impl SomeCustomReduction;
+ * @endcode
+ */
+#define FLAMEGPU_CUSTOM_REDUCTION(funcName, a, b)\
+struct funcName ## _impl {\
+ public:\
+    template <typename OutT>\
+    struct binary_function {\
+        __host__ __device__ __forceinline__ OutT operator()(const OutT &a, const OutT &b) const;\
+    };\
+};\
+funcName ## _impl funcName;\
+template <typename OutT>\
+__host__ __device__ __forceinline__ OutT funcName ## _impl::binary_function<OutT>::operator()(const OutT & a, const OutT & b) const
+
+ /**
+  * Macro for defining custom transform functions with the correct inputs.
+  *
+  * (a)->(b)
+  *
+  * These functions must be valid CUDA code (they are declared __host__ __device__), and have no access to the FLAMEGPU DeviceAPI.
+  *
+  * Saves users from manually defining custom transformations, e.g.:
+  * @code{.cpp}
+  * // User Implemented custom transform
+  * struct SomeCustomTransform_impl {
+  *  public:
+  *     template<typename InT, typename OutT>
+  *     struct unary_function {
+  *         __host__ __device__ __forceinline__ OutT operator()(const InT &a) const {
+  *              // transform something
+  *              return a * a;
+  *         }
+  *     };
+  * };
+  * SomeCustomTransform_impl SomeCustomTransform;
+  * @endcode
+  */
+#define FLAMEGPU_CUSTOM_TRANSFORM(funcName, a)\
+struct funcName ## _impl {\
+ public:\
+    template<typename InT, typename OutT>\
+    struct unary_function {\
+        __host__ __device__ OutT operator()(const InT &a) const;\
+    };\
+};\
+funcName ## _impl funcName;\
+template<typename InT, typename OutT>\
+__host__ __device__ __forceinline__ OutT funcName ## _impl::unary_function<InT, OutT>::operator()(const InT &a) const
+
+/**
+ * Collection of HostAPI functions related to agents
+ *
+ * Mostly provides access to reductions over agent variables
+ */
+class HostAgentAPI {
+    /**
+     * Access to async sort method/s by spatialSortAgent_async()
+     */
+    friend class CUDASimulation;
+
+ public:
+   /**
+    * Construct a new HostAgentAPI instance for a specified agent type and state
+    *
+    * @param _api Parent HostAPI instance
+    * @param _agent Agent object holding the agent data
+    * @param _stateName Name of the agent state to be represented
+    * @param _agentOffsets Layout of memory within the Host Agent Birth data structure (_newAgentData)
+    * @param _newAgentData Structure containing agents birthed via Host Agent Birth
+    */
+    HostAgentAPI(HostAPI &_api, detail::AgentInterface &_agent, const std::string &_stateName, const VarOffsetStruct &_agentOffsets, HostAPI::AgentDataBuffer&_newAgentData)
+        : api(_api)
+        , agent(_agent)
+        , stateName(_stateName)
+        , agentOffsets(_agentOffsets)
+        , newAgentData(_newAgentData) { }
+    /**
+     * Copy constructor
+     * Not actually sure this is required
+     */
+    HostAgentAPI(const HostAgentAPI& other)
+        : api(other.api)
+        , agent(other.agent)
+        , stateName(other.stateName)
+        , agentOffsets(other.agentOffsets)
+        , newAgentData(other.newAgentData)
+    { }
+    /**
+     * Creates a new agent in the current agent and returns an object for configuring its member variables
+     * 
+     * This mode of agent creation is more efficient than manipulating the vector returned by getPopulationData(),
+     * as it batches agent creation to a single scatter kernel if possible (e.g. no data dependencies).
+     */
+    HostNewAgentAPI newAgent();
+    /**
+     * Returns the number of agents in this state
+     */
+    unsigned int count();
+    /**
+     * Wraps cub::DeviceReduce::Sum()
+     * @param variable The agent variable to perform the sum reduction across
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT>
+    InT sum(const std::string &variable) const;
+    /**
+     * Wraps cub::DeviceReduce::Sum()
+     * @param variable The agent variable to perform the sum reduction across
+     * @tparam OutT The template arg, 'OutT' can be used if the sum is expected to exceed the representation of the type being summed
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT, typename OutT>
+    OutT sum(const std::string &variable) const;
+    /**
+     * Returns the mean and standard deviation of the specified variable in the agent population
+     * The return value is a pair, where the first item holds the mean and the second item the standard deviation.
+     * @param variable The agent variable to perform the sum reduction across
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note If you only require the mean, it is more efficient to use sum()/count()
+     */
+    template<typename InT>
+    std::pair<double, double> meanStandardDeviation(const std::string& variable) const;
+    /**
+     * Wraps cub::DeviceReduce::Min()
+     * @param variable The agent variable to perform the lowerBound reduction across
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT>
+    InT min(const std::string &variable) const;
+    /**
+     * Wraps cub::DeviceReduce::Max()
+     * @param variable The agent variable to perform the upperBound reduction across
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT>
+    InT max(const std::string &variable) const;
+    /**
+     * Wraps thrust::count(), to count the number of occurrences of the provided value
+     * @param variable The agent variable to perform the count reduction across
+     * @param value The value to count occurrences of
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT>
+    unsigned int count(const std::string &variable, InT value) const;
+    /**
+     * Wraps cub::DeviceHistogram::HistogramEven()
+     * @param variable The agent variable to perform the reduction across
+     * @param histogramBins The number of bins the histogram should have
+     * @param lowerBound The (inclusive) lower sample value boundary of lowest bin
+     * @param upperBound The (exclusive) upper sample value boundary of upper bin
+     * @note 2nd template arg can be used if calculation requires higher bit type to avoid overflow
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT>
+    std::vector<unsigned int> histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const;
+    template<typename InT, typename OutT>
+    std::vector<OutT> histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const;
+    /**
+     * Wraps cub::DeviceReduce::Reduce(), to perform a reduction with a custom operator
+     * @param variable The agent variable to perform the reduction across
+     * @param reductionOperator The custom reduction function
+     * @param init Initial value of the reduction
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT, typename reductionOperatorT>
+    InT reduce(const std::string &variable, reductionOperatorT reductionOperator, InT init) const;
+    /**
+     * Wraps thrust::transform_reduce(), to perform a custom transform on values before performing a custom reduction
+     * @param variable The agent variable to perform the reduction across
+     * @param transformOperator The custom unary transform function
+     * @param reductionOperator The custom binary reduction function
+     * @param init Initial value of the reduction
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
+    OutT transformReduce(const std::string &variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init) const;
+    /**
+     * Sort ordering
+     * Ascending or Descending
+     */
+    enum Order {Asc, Desc};
+    /**
+     * Sorts agents according to the named variable
+     * @param variable The agent variable to sort the agents according to
+     * @param order Whether the agents should be sorted in ascending or descending order of the variable
+     * @param beginBit Advanced Option, see note
+     * @param endBit Advanced Option, see note
+     * @tparam VarT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note An optional bit subrange [begin_bit, end_bit) of differentiating variable bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * @note The sort provides no guarantee of stability
+     */
+    template<typename VarT>
+    void sort(const std::string &variable, Order order, int beginBit = 0, int endBit = sizeof(VarT)*8);
+    /**
+     * Sort agents according to two variables e.g. [1:c, 3:b, 1:b, 1:a] -> [1:a, 1:b, 1:c, 3:b]
+     * @param variable1 This variable will be the main direction that agents are sorted
+     * @param order1 The order that variable 1 should be sorted according to
+     * @param variable2 Agents with equal variable1's, will be sorted according to this variable
+     * @param order2 The order that variable 2 should be sorted according to
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @tparam Var1T The type of variable1 as specified in the model description hierarchy
+     * @tparam Var2T The type of variable2 as specified in the model description hierarchy
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     */
+    template<typename Var1T, typename Var2T>
+    void sort(const std::string &variable1, Order order1, const std::string &variable2, Order order2);
+    /**
+     * Downloads the current agent state from device into an AgentVector which is returned
+     *
+     * This function is considered expensive, as it triggers a high number of host-device memory transfers.
+     * It should be used as a last resort
+     */
+    DeviceAgentVector getPopulationData();
+
+ private:
+    /**
+     * Fills the provided device buffer with consecutive integers
+     * @param d_buffer Device pointer to buffer to be filled
+     * @param length Length of the buffer (how many unsigned ints can it hold)
+     * @param stream CUDA stream to be used for async CUDA operations
+     */
+    static void fillTIDArray_async(unsigned int *d_buffer, unsigned int length, cudaStream_t stream);
+    /**
+     * Sorts a buffer by the positions array, used for multi variable agent sorts
+     * @param dest Device pointer to buffer for sorted data to be placed
+     * @param src Device pointer to buffer to be sorted
+     * @param position Positions buffer
+     * @param typeLen sizeof the type stored in the buffer (e.g. sizeof(int))
+     * @param length Length of the buffer (how many items it can it hold)
+     * @param stream CUDA stream to be used for async CUDA operations
+     */
+    static void sortBuffer_async(void *dest, void*src, unsigned int *position, size_t typeLen, unsigned int length, cudaStream_t stream);
+    /**
+     * Wraps cub::DeviceReduce::Sum()
+     * @param variable The agent variable to perform the sum reduction across
+     * @param result Variable which will store the result (note method is async, result may not arrive until stream is synchronised)
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam OutT The template arg, 'OutT' can be used if the sum is expected to exceed the representation of the type being summed
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Method is async, result may not arrive until stream is synchronised
+     */
+    template<typename InT, typename OutT>
+    void sum_async(const std::string& variable, OutT& result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Returns the mean and standard deviation of the specified variable in the agent population
+     * The return value is a pair, where the first item holds the mean and the second item the standard deviation.
+     * @param variable The agent variable to perform the sum reduction across
+     * @param result Variable which will store the result
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note If you only require the mean, it is more efficient to use sum()/count()
+     * @note Not actually async, would need a big rewrite (and to stop using the shared device symbol?)
+     */
+    template<typename InT>
+    void meanStandardDeviation_async(const std::string& variable, std::pair<double, double>& result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Wraps cub::DeviceReduce::Min()
+     * @param variable The agent variable to perform the lowerBound reduction across
+     * @param result Variable which will store the result (note method is async, result may not arrive until stream is synchronised)
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Method is async, result may not arrive until stream is synchronised
+     */
+    template<typename InT>
+    void min_async(const std::string& variable, InT& result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Wraps cub::DeviceReduce::Max()
+     * @param variable The agent variable to perform the upperBound reduction across
+     * @param result Variable which will store the result (note method is async, result may not arrive until stream is synchronised)
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Method is async, result may not arrive until stream is synchronised
+     */
+    template<typename InT>
+    void max_async(const std::string& variable, InT& result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Wraps thrust::count(), to count the number of occurrences of the provided value
+     * @param variable The agent variable to perform the count reduction across
+     * @param value The value to count occurrences of
+     * @param stream The CUDAStream to use for CUDA operations
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Not actually async, uses thrust method that doesn't support async, uses specified stream though
+     */
+    template<typename InT>
+    unsigned int count_async(const std::string& variable, InT value, cudaStream_t stream) const;
+    /**
+     * Wraps cub::DeviceHistogram::HistogramEven(), to build a histogram of the variable's values using evenly sized bins
+     * @param variable The agent variable to perform the reduction across
+     * @param histogramBins The number of bins the histogram should have
+     * @param lowerBound The (inclusive) lower sample value boundary of lowest bin
+     * @param upperBound The (exclusive) upper sample value boundary of upper bin
+     * @param result Vector which will be resized to histogramBins elements and store the result (note method is async, result may not arrive until stream is synchronised)
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @tparam OutT The type of the histogram bin variables
+     * @throws exception::InvalidArgument If lowerBound is not less than upperBound
+     * @throws exception::InvalidOperation If the agent state has a DeviceAgentVector and stream is not the parent HostAPI's stream
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Method is async, result may not arrive until stream is synchronised
+     */
+    template<typename InT, typename OutT>
+    void histogramEven_async(const std::string& variable, unsigned int histogramBins, InT lowerBound, InT upperBound, std::vector<OutT> &result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Wraps cub::DeviceReduce::Reduce(), to perform a reduction with a custom operator
+     * @param variable The agent variable to perform the reduction across
+     * @param reductionOperator The custom reduction function (only its type is used, the value passed is ignored)
+     * @param init Initial value of the reduction
+     * @param result Variable which will store the result (note method is async, result may not arrive until stream is synchronised)
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId Index of stream specific structures used
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @tparam reductionOperatorT The type of the custom reduction function, its binary_function<InT> member template is default constructed and passed to cub
+     * @throws exception::InvalidOperation If the agent state has a DeviceAgentVector and stream is not the parent HostAPI's stream
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Method is async, result may not arrive until stream is synchronised
+     */
+    template<typename InT, typename reductionOperatorT>
+    void reduce_async(const std::string& variable, reductionOperatorT reductionOperator, InT init, InT &result, cudaStream_t stream, unsigned int streamId) const;
+    /**
+     * Wraps thrust::transform_reduce(), to perform a custom transform on values before performing a custom reduction
+     * @param variable The agent variable to perform the reduction across
+     * @param transformOperator The custom unary transform function (only its type is used, the value passed is ignored)
+     * @param reductionOperator The custom binary reduction function (only its type is used, the value passed is ignored)
+     * @param init Initial value of the reduction
+     * @param stream The CUDAStream to use for CUDA operations
+     * @tparam InT The type of the variable as specified in the model description hierarchy
+     * @tparam OutT The type of the initial value and of the returned result
+     * @tparam transformOperatorT The type of the custom transform function, its unary_function<InT, OutT> member template is default constructed and passed to thrust
+     * @tparam reductionOperatorT The type of the custom reduction function, its binary_function<OutT> member template is default constructed and passed to thrust
+     * @return The value returned by thrust::transform_reduce()
+     * @throws exception::InvalidOperation If the agent state has a DeviceAgentVector and stream is not the parent HostAPI's stream
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Not actually async, uses thrust method that doesn't support async, uses specified stream though
+     */
+    template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
+    OutT transformReduce_async(const std::string& variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init, cudaStream_t stream) const;
+    /**
+     * Sorts agents according to the named variable
+     * @param variable The agent variable to sort the agents according to
+     * @param order Whether the agents should be sorted in ascending or descending order of the variable
+     * @param beginBit Advanced Option, see note
+     * @param endBit Advanced Option, see note
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId The index of the stream resources to use
+     * @tparam VarT The type of the variable as specified in the model description hierarchy
+     * @throws exception::InvalidOperation If the agent state has a DeviceAgentVector and stream/streamId are not those of the parent HostAPI
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note An optional bit subrange [beginBit, endBit) of differentiating variable bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * @note The sort provides no guarantee of stability
+     */
+    template<typename VarT>
+    void sort_async(const std::string& variable, Order order, int beginBit, int endBit, cudaStream_t stream, unsigned int streamId);
+    /**
+     * Sort agents according to two variables e.g. [1:c, 3:b, 1:b, 1:a] -> [1:a, 1:b, 1:c, 3:b]
+     * @param variable1 This variable will be the main direction that agents are sorted
+     * @param order1 The order that variable 1 should be sorted according to
+     * @param variable2 Agents with equal variable1's, will be sorted according to this variable
+     * @param order2 The order that variable 2 should be sorted according to
+     * @param stream The CUDAStream to use for CUDA operations
+     * @param streamId The index of the stream resources to use
+     * @tparam Var1T The type of variable1 as specified in the model description hierarchy
+     * @tparam Var2T The type of variable2 as specified in the model description hierarchy
+     * @throws exception::InvalidOperation If the agent state has a DeviceAgentVector and stream/streamId are not those of the parent HostAPI
+     * @throws exception::UnsupportedVarType Array variables are not supported
+     * @throws exception::InvalidAgentVar If the agent does not contain a variable of the same name
+     * @throws exception::InvalidVarType If the passed variable type does not match that specified in the model description hierarchy
+     * @note Not actually async, uses thrust method that doesn't support async, uses specified stream though
+     */
+    template<typename Var1T, typename Var2T>
+    void sort_async(const std::string& variable1, Order order1, const std::string& variable2, Order order2, cudaStream_t stream, unsigned int streamId);
+    /**
+     * Parent HostAPI
+     * Provides the stream, streamId, scatter buffers and device output space used by the methods of this class
+     */
+    HostAPI &api;
+    /**
+     * Main object containing agent data
+     * Probably type CUDAAgent
+     * Used to look up the agent description, state sizes, device variable pointers and any DeviceAgentVector for a state
+     */
+    detail::AgentInterface &agent;
+    /**
+     * Agent state being accessed
+     */
+    const std::string stateName;
+    /**
+     * Holds offsets for accessing newAgentData
+     * @see newAgent()
+     */
+    const VarOffsetStruct& agentOffsets;
+    /**
+     * Compact data store for efficient host agent creation
+     * @see newAgent()
+     */
+    HostAPI::AgentDataBuffer& newAgentData;
+};
+
+//
+// Implementation
+//
+
+template<typename InT>
+InT HostAgentAPI::sum(const std::string &variable) const {
+    // Queue the reduction on the HostAPI's stream, then wait for the result to be copied back
+    InT total;
+    sum_async<InT, InT>(variable, total, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return total;
+}
+template<typename InT, typename OutT>
+OutT HostAgentAPI::sum(const std::string& variable) const {
+    // Queue the reduction on the HostAPI's stream, then wait for the result to be copied back
+    OutT total;
+    sum_async<InT, OutT>(variable, total, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return total;
+}
+template<typename InT, typename OutT>
+void HostAgentAPI::sum_async(const std::string &variable, OutT &result, const cudaStream_t stream, const unsigned int streamId) const {
+    static_assert(sizeof(InT) <= sizeof(OutT), "Template arg OutT should not be of a smaller size than InT");
+    // If the user holds a DeviceAgentVector for this state, sync its changes first
+    const std::shared_ptr<DeviceAgentVector_impl> activeVector = agent.getPopulationVec(stateName);
+    if (activeVector) {
+        if (stream != this->api.stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        activeVector->syncChanges();
+    }
+    // Validate the requested variable: it must be a scalar of type InT
+    const CAgentDescription description(agent.getAgentDescription());
+    const std::type_index declaredType = description.getVariableType(variable);  // This will throw name exception
+    if (description.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::sum() does not support agent array variables.");
+    }
+    if (declaredType != std::type_index(typeid(InT))) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::sum(). "
+            "This call expects '%s', but '%s' was requested.",
+            declaredType.name(), typeid(InT).name());
+    }
+    InT *const d_input = reinterpret_cast<InT*>(agent.getStateVariablePtr(stateName, variable));
+    const int itemCount = static_cast<int>(agent.getStateSize(stateName));
+    auto &tempStorage = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    size_t requiredBytes = 0;
+    gpuErrchk(cub::DeviceReduce::Sum(nullptr, requiredBytes, d_input, reinterpret_cast<OutT*>(api.d_output_space), itemCount, stream));
+    tempStorage.resize(requiredBytes);
+    // Resize output storage, then perform the actual reduction
+    api.resizeOutputSpace<OutT>();
+    gpuErrchk(cub::DeviceReduce::Sum(tempStorage.getPtr(), tempStorage.getSize(), d_input, reinterpret_cast<OutT*>(api.d_output_space), itemCount, stream));
+    gpuErrchkLaunch();
+    // Queue the copy of the result to the host, it is only valid once stream has been synchronised
+    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(OutT), cudaMemcpyDeviceToHost, stream));
+}
+template<typename InT>
+std::pair<double, double> HostAgentAPI::meanStandardDeviation(const std::string& variable) const {
+    std::pair<double, double> meanAndDeviation;
+    meanStandardDeviation_async<InT>(variable, meanAndDeviation, api.stream, api.streamId);
+    // Redundant, meanStandardDeviation_async() is not truly async
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return meanAndDeviation;
+}
+template<typename InT>
+void HostAgentAPI::meanStandardDeviation_async(const std::string& variable, std::pair<double, double> &result, const cudaStream_t stream, const unsigned int streamId) const {
+    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
+    if (population) {
+        if (this->api.stream != stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        // If the user has a DeviceAgentVector out, sync changes
+        population->syncChanges();
+    }
+    const CAgentDescription agentDesc(agent.getAgentDescription());
+    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
+    if (agentDesc.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::meanStandardDeviation() does not support agent array variables.");
+    }
+    if (std::type_index(typeid(InT)) != typ) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::meanStandardDeviation(). "
+            "This call expects '%s', but '%s' was requested.",
+            agentDesc.getVariableType(variable).name(), typeid(InT).name());
+    }
+    const auto agentCount = agent.getStateSize(stateName);
+    if (agentCount == 0) {
+        // Nothing to reduce: report zeros and stop here, otherwise the divisions by agentCount below would overwrite result with NaN
+        result = std::make_pair(0.0, 0.0);
+        return;
+    }
+    // Calculate mean (We could make this more efficient by leaving sum in device mem?)
+    typename sum_input_t<InT>::result_t sum_result;
+    sum_async<InT, typename sum_input_t<InT>::result_t>(variable, sum_result, stream, streamId);
+    // sum_result is read on the host in the next statement, so the async copy must have completed
+    gpuErrchk(cudaStreamSynchronize(stream));
+    const double mean = sum_result / static_cast<double>(agentCount);
+    // Then for each number: subtract the Mean and square the result
+    // Then work out the mean of those squared differences.
+    // The mean is uploaded to the shared device symbol STANDARD_DEVIATION_MEAN (presumably read by standard_deviation_subtract_mean),
+    // so the mutex is held from the upload until the reduction has returned
+    auto lock = std::unique_lock<std::mutex>(detail::STANDARD_DEVIATION_MEAN_mutex);
+    gpuErrchk(cudaMemcpyToSymbolAsync(detail::STANDARD_DEVIATION_MEAN, &mean, sizeof(double), 0, cudaMemcpyHostToDevice, stream));
+    const double variance = transformReduce_async<InT, double>(variable, detail::standard_deviation_subtract_mean, detail::standard_deviation_add, 0, stream) / static_cast<double>(agentCount);
+    lock.unlock();
+    // Take the square root of that and we are done!
+    result = std::make_pair(mean, sqrt(variance));
+}
+template<typename InT>
+InT HostAgentAPI::min(const std::string& variable) const {
+    // Queue the reduction on the HostAPI's stream, then wait for the result to be copied back
+    InT smallest;
+    min_async<InT>(variable, smallest, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return smallest;
+}
+template<typename InT>
+void HostAgentAPI::min_async(const std::string &variable, InT& result, const cudaStream_t stream, const unsigned int streamId) const {
+    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
+    if (population) {
+        if (this->api.stream != stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        // If the user has a DeviceAgentVector out, sync changes
+        population->syncChanges();
+    }
+    const CAgentDescription agentDesc(agent.getAgentDescription());
+    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
+    if (agentDesc.getVariableLength(variable) != 1) {
+        // Message names min(), matching the InvalidVarType message below
+        THROW exception::UnsupportedVarType("HostAgentAPI::min() does not support agent array variables.");
+    }
+    if (std::type_index(typeid(InT)) != typ) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::min(). "
+            "This call expects '%s', but '%s' was requested.",
+            agentDesc.getVariableType(variable).name(), typeid(InT).name());
+    }
+    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
+    const auto agentCount = agent.getStateSize(stateName);
+    auto& cub_temp = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    size_t tempByte = 0;
+    gpuErrchk(cub::DeviceReduce::Min(nullptr, tempByte, reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
+    gpuErrchkLaunch();
+    cub_temp.resize(tempByte);
+    // Resize output storage
+    api.resizeOutputSpace<InT>();
+    gpuErrchk(cub::DeviceReduce::Min(cub_temp.getPtr(), cub_temp.getSize(), reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
+    gpuErrchkLaunch();
+    // Queue the copy of the result to the host, it is only valid once stream has been synchronised
+    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
+}
+template<typename InT>
+InT HostAgentAPI::max(const std::string& variable) const {
+    // Queue the reduction on the HostAPI's stream, then wait for the result to be copied back
+    InT largest;
+    max_async<InT>(variable, largest, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return largest;
+}
+template<typename InT>
+void HostAgentAPI::max_async(const std::string &variable, InT &result, const cudaStream_t stream, const unsigned int streamId) const {
+    // If the user holds a DeviceAgentVector for this state, sync its changes first
+    const std::shared_ptr<DeviceAgentVector_impl> activeVector = agent.getPopulationVec(stateName);
+    if (activeVector) {
+        if (stream != this->api.stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        activeVector->syncChanges();
+    }
+    // Validate the requested variable: it must be a scalar of type InT
+    const CAgentDescription description(agent.getAgentDescription());
+    const std::type_index declaredType = description.getVariableType(variable);  // This will throw name exception
+    if (description.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::max() does not support agent array variables.");
+    }
+    if (declaredType != std::type_index(typeid(InT))) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::max(). "
+            "This call expects '%s', but '%s' was requested.",
+            declaredType.name(), typeid(InT).name());
+    }
+    InT *const d_input = reinterpret_cast<InT*>(agent.getStateVariablePtr(stateName, variable));
+    const int itemCount = static_cast<int>(agent.getStateSize(stateName));
+    auto &tempStorage = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    size_t requiredBytes = 0;
+    gpuErrchk(cub::DeviceReduce::Max(nullptr, requiredBytes, d_input, reinterpret_cast<InT*>(api.d_output_space), itemCount, stream));
+    tempStorage.resize(requiredBytes);
+    // Resize output storage, then perform the actual reduction
+    api.resizeOutputSpace<InT>();
+    gpuErrchk(cub::DeviceReduce::Max(tempStorage.getPtr(), tempStorage.getSize(), d_input, reinterpret_cast<InT*>(api.d_output_space), itemCount, stream));
+    gpuErrchkLaunch();
+    // Queue the copy of the result to the host, it is only valid once stream has been synchronised
+    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
+}
+template<typename InT>
+unsigned int HostAgentAPI::count(const std::string &variable, InT value) const {
+    // count_async() returns the count directly, so no explicit synchronisation is performed here
+    const unsigned int matches = count_async<InT>(variable, value, api.stream);
+    return matches;
+}
+template<typename InT>
+unsigned int HostAgentAPI::count_async(const std::string& variable, InT value, const cudaStream_t stream) const {
+    // If the user holds a DeviceAgentVector for this state, sync its changes first
+    const std::shared_ptr<DeviceAgentVector_impl> activeVector = agent.getPopulationVec(stateName);
+    if (activeVector) {
+        if (stream != this->api.stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        activeVector->syncChanges();
+    }
+    // Validate the requested variable: it must be a scalar of type InT
+    const CAgentDescription description(agent.getAgentDescription());
+    const std::type_index declaredType = description.getVariableType(variable);  // This will throw name exception
+    if (description.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::count() does not support agent array variables.");
+    }
+    if (declaredType != std::type_index(typeid(InT))) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::count(). "
+            "This call expects '%s', but '%s' was requested.",
+            declaredType.name(), typeid(InT).name());
+    }
+    InT *const d_first = reinterpret_cast<InT*>(agent.getStateVariablePtr(stateName, variable));
+    const auto agentTotal = agent.getStateSize(stateName);
+    // Wrap the raw device range [d_first, d_first + agentTotal) for thrust
+    const thrust::device_ptr<InT> first(d_first);
+    const thrust::device_ptr<InT> last(d_first + agentTotal);
+    // Cast return from ptrdiff_t (int64_t) to (uint32_t)
+    const unsigned int matches = static_cast<unsigned int>(thrust::count(thrust::cuda::par.on(stream), first, last, value));
+    gpuErrchkLaunch();
+    return matches;
+}
+template<typename InT>
+std::vector<unsigned int> HostAgentAPI::histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const {
+    // Bins default to unsigned int counters; wait for the stream so the vector holds the copied result
+    std::vector<unsigned int> bins;
+    histogramEven_async<InT, unsigned int>(variable, histogramBins, lowerBound, upperBound, bins, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return bins;
+}
+template<typename InT, typename OutT>
+std::vector<OutT> HostAgentAPI::histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const {
+    // Wait for the stream so the vector holds the copied result before it is returned
+    std::vector<OutT> bins;
+    histogramEven_async<InT, OutT>(variable, histogramBins, lowerBound, upperBound, bins, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return bins;
+}
+template<typename InT, typename OutT>
+void HostAgentAPI::histogramEven_async(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound, std::vector<OutT>& result, const cudaStream_t stream, const unsigned int streamId) const {
+    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
+    if (population) {
+        if (this->api.stream != stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        // If the user has a DeviceAgentVector out, sync changes
+        population->syncChanges();
+    }
+    if (lowerBound >= upperBound) {
+        THROW exception::InvalidArgument("lowerBound (%s) must be lower than < upperBound (%s) in HostAgentAPI::histogramEven().",
+            std::to_string(lowerBound).c_str(), std::to_string(upperBound).c_str());
+    }
+    const CAgentDescription agentDesc(agent.getAgentDescription());
+    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
+    if (agentDesc.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::histogramEven() does not support agent array variables.");
+    }
+    if (std::type_index(typeid(InT)) != typ) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::histogramEven(). "
+            "This call expects '%s', but '%s' was requested.",
+            agentDesc.getVariableType(variable).name(), typeid(InT).name());
+    }
+    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
+    const auto agentCount = agent.getStateSize(stateName);
+    auto& cub_temp = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    // The counter pointer must be OutT* here too, so the size query uses the same template instantiation as the call that does the work
+    size_t tempByte = 0;
+    gpuErrchk(cub::DeviceHistogram::HistogramEven(nullptr, tempByte,
+        reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), histogramBins + 1, lowerBound, upperBound, static_cast<int>(agentCount), stream));
+    gpuErrchkLaunch();
+    cub_temp.resize(tempByte);
+    // Resize output storage
+    api.resizeOutputSpace<OutT>(histogramBins);
+    // cub is given the number of bin boundaries (levels), which is one more than the number of bins
+    gpuErrchk(cub::DeviceHistogram::HistogramEven(cub_temp.getPtr(), cub_temp.getSize(),
+        reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), histogramBins + 1, lowerBound, upperBound, static_cast<int>(agentCount), stream));
+    gpuErrchkLaunch();
+    // Queue the copy of the bins to the host, they are only valid once stream has been synchronised
+    result.resize(histogramBins);
+    gpuErrchk(cudaMemcpyAsync(result.data(), api.d_output_space, histogramBins * sizeof(OutT), cudaMemcpyDeviceToHost, stream));
+}
+template<typename InT, typename reductionOperatorT>
+InT HostAgentAPI::reduce(const std::string &variable, reductionOperatorT reductionOperator, InT init) const {
+    // Queue the reduction on the HostAPI's stream, then wait for the result to be copied back
+    InT reduced;
+    reduce_async<InT, reductionOperatorT>(variable, reductionOperator, init, reduced, api.stream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(api.stream));
+    return reduced;
+}
+template<typename InT, typename reductionOperatorT>
+void HostAgentAPI::reduce_async(const std::string & variable, reductionOperatorT /*reductionOperator*/, InT init, InT &result, const cudaStream_t stream, const unsigned int streamId) const {
+    // Only the operator's type is used: its binary_function<InT> member template is constructed for each cub call
+    using ElementT = typename detail::type_decode<InT>::type_t;
+    using BinaryOpT = typename reductionOperatorT::template binary_function<InT>;
+    // If the user holds a DeviceAgentVector for this state, sync its changes first
+    const std::shared_ptr<DeviceAgentVector_impl> activeVector = agent.getPopulationVec(stateName);
+    if (activeVector) {
+        if (stream != this->api.stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        activeVector->syncChanges();
+    }
+    // Validate the requested variable against the element type and length decoded from InT
+    const CAgentDescription description(agent.getAgentDescription());
+    const std::type_index declaredType = description.getVariableType(variable);  // This will throw name exception
+    if (description.getVariableLength(variable) != detail::type_decode<InT>::len_t) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::reduce() does not support agent array variables.");
+    }
+    if (declaredType != std::type_index(typeid(ElementT))) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::reduce(). "
+            "This call expects '%s', but '%s' was requested.",
+            declaredType.name(), typeid(ElementT).name());
+    }
+    InT *const d_input = reinterpret_cast<InT*>(agent.getStateVariablePtr(stateName, variable));
+    const int itemCount = static_cast<int>(agent.getStateSize(stateName));
+    auto &tempStorage = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    size_t requiredBytes = 0;
+    gpuErrchk(cub::DeviceReduce::Reduce(nullptr, requiredBytes, d_input, reinterpret_cast<InT*>(api.d_output_space),
+        itemCount, BinaryOpT(), init, stream));
+    gpuErrchkLaunch();
+    tempStorage.resize(requiredBytes);
+    // Resize output storage, then perform the actual reduction
+    api.resizeOutputSpace<InT>();
+    gpuErrchk(cub::DeviceReduce::Reduce(tempStorage.getPtr(), tempStorage.getSize(), d_input, reinterpret_cast<InT*>(api.d_output_space),
+        itemCount, BinaryOpT(), init, stream));
+    gpuErrchkLaunch();
+    // Queue the copy of the result to the host, it is only valid once stream has been synchronised
+    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
+}
+template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
+OutT HostAgentAPI::transformReduce(const std::string &variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init) const {
+    // transformReduce_async() returns the reduced value directly, so no explicit synchronisation is performed here
+    OutT reduced = transformReduce_async<InT, OutT, transformOperatorT, reductionOperatorT>(variable, transformOperator, reductionOperator, init, api.stream);
+    return reduced;
+}
+template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
+OutT HostAgentAPI::transformReduce_async(const std::string &variable, transformOperatorT /*transformOperator*/, reductionOperatorT /*reductionOperator*/, OutT init, cudaStream_t stream) const {
+    // Only the operator types are used: their member templates are constructed for the thrust call
+    using ElementT = typename detail::type_decode<InT>::type_t;
+    using UnaryOpT = typename transformOperatorT::template unary_function<InT, OutT>;
+    using BinaryOpT = typename reductionOperatorT::template binary_function<OutT>;
+    // If the user holds a DeviceAgentVector for this state, sync its changes first
+    const std::shared_ptr<DeviceAgentVector_impl> activeVector = agent.getPopulationVec(stateName);
+    if (activeVector) {
+        if (stream != this->api.stream) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        activeVector->syncChanges();
+    }
+    // Validate the requested variable against the element type and length decoded from InT
+    const CAgentDescription description(agent.getAgentDescription());
+    const std::type_index declaredType = description.getVariableType(variable);  // This will throw name exception
+    if (description.getVariableLength(variable) != detail::type_decode<InT>::len_t) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::transformReduce() does not support agent array variables.");
+    }
+    if (declaredType != std::type_index(typeid(ElementT))) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::transformReduce(). "
+            "This call expects '%s', but '%s' was requested.",
+            declaredType.name(), typeid(ElementT).name());
+    }
+    InT *const d_first = reinterpret_cast<InT*>(agent.getStateVariablePtr(stateName, variable));
+    const auto agentTotal = agent.getStateSize(stateName);
+    // Wrap the raw device range [d_first, d_first + agentTotal) for thrust
+    const thrust::device_ptr<InT> first(d_first);
+    const thrust::device_ptr<InT> last(d_first + agentTotal);
+    OutT reduced = thrust::transform_reduce(thrust::cuda::par.on(stream), first, last, UnaryOpT(), init, BinaryOpT());
+    gpuErrchkLaunch();
+    return reduced;
+}
+
+
+template<typename VarT>
+void HostAgentAPI::sort(const std::string &variable, Order order, int beginBit, int endBit) {
+    // Queue the sort on the HostAPI's stream and stream resources, then wait for it to complete
+    const cudaStream_t hostStream = api.stream;
+    sort_async<VarT>(variable, order, beginBit, endBit, hostStream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(hostStream));
+}
+template<typename VarT>
+void HostAgentAPI::sort_async(const std::string & variable, Order order, int beginBit, int endBit, const cudaStream_t stream, const unsigned int streamId) {
+    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
+    if (population) {
+        if (this->api.stream != stream || this->api.streamId != streamId) {
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        // If the user has a DeviceAgentVector out, sync changes
+        population->syncChanges();
+    }
+    auto &scatter = api.agentModel.singletons->scatter;
+    auto &scan = scatter.Scan();
+    // Check variable is valid
+    CAgentDescription agentDesc(agent.getAgentDescription());
+    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
+    if (agentDesc.getVariableLength(variable) != 1) {
+        THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
+    }
+    if (std::type_index(typeid(VarT)) != typ) {
+        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::sort(). "
+            "This call expects '%s', but '%s' was requested.",
+            agentDesc.getVariableType(variable).name(), typeid(VarT).name());
+    }
+    // We will use scan_flag agent_death/message_output here so resize
+    const unsigned int agentCount = agent.getStateSize(stateName);
+    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
+    const size_t total_variable_buffer_size = sizeof(VarT) * agentCount;
+    // The agent_death buffers are reinterpreted as VarT keys below, so their size is requested as the
+    // key buffer's byte size expressed in unsigned int elements (+1 to cover the truncating division)
+    const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
+    scan.resize(fake_num_agent, detail::CUDAScanCompaction::AGENT_DEATH, streamId);
+    scan.resize(agentCount, detail::CUDAScanCompaction::MESSAGE_OUTPUT, streamId);
+    // Keys (variable values) reuse the agent_death buffers, values (indices) reuse the message_output buffers
+    VarT *keys_in = reinterpret_cast<VarT *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.scan_flag);
+    VarT *keys_out = reinterpret_cast<VarT *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position);
+    unsigned int *vals_in = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag;
+    unsigned int *vals_out = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.position;
+    // Create array of TID (uses the message_output scan_flag buffer)
+    fillTIDArray_async(vals_in, agentCount, stream);
+    // Create array of agent values (uses the agent_death scan_flag buffer)
+    gpuErrchk(cudaMemcpyAsync(keys_in, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
+    auto& cub_temp = api.scatter.CubTemp(streamId);
+    // With a null storage pointer cub only reports the temporary storage it requires
+    size_t tempByte = 0;
+    if (order == Asc) {
+        gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, tempByte, keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
+    } else {
+        gpuErrchk(cub::DeviceRadixSort::SortPairsDescending(nullptr, tempByte, keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
+    }
+    cub_temp.resize(tempByte);
+    // Pair sort: the sorted keys are written to keys_out and their matching indices to vals_out
+    if (order == Asc) {
+        gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
+    } else {
+        gpuErrchk(cub::DeviceRadixSort::SortPairsDescending(cub_temp.getPtr(), cub_temp.getSize(), keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
+    }
+    // Scatter all agent variables
+    api.agentModel.agent_map.at(agentDesc.getName())->scatterSort_async(stateName, scatter, streamId, stream);
+    if (population) {
+        // If the user has a DeviceAgentVector out, purge cache so it redownloads new data on next use
+        population->purgeCache();
+    }
+}
+
+
+template<typename Var1T, typename Var2T>
+void HostAgentAPI::sort(const std::string &variable1, Order order1, const std::string &variable2, Order order2) {
+    // Queue the sort on the HostAPI's stream and stream resources, then wait for it to complete
+    const cudaStream_t hostStream = api.stream;
+    sort_async<Var1T, Var2T>(variable1, order1, variable2, order2, hostStream, api.streamId);
+    gpuErrchk(cudaStreamSynchronize(hostStream));
+}
+template<typename Var1T, typename Var2T>  // Two-key sort of the agents in stateName: variable2 is sorted first, then variable1 is stable-sorted, making variable1 the primary key
+void HostAgentAPI::sort_async(const std::string & variable1, Order order1, const std::string & variable2, Order order2, const cudaStream_t stream, const unsigned int streamId) {
+    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
+    if (population) {
+        if (this->api.stream != stream || this->api.streamId != streamId) {  // Syncing is only permitted when the caller passed the HostAPI's own stream and stream index
+            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
+        }
+        // If the user has a DeviceAgentVector out, sync changes before the agent buffers are read below
+        population->syncChanges();
+    }
+    auto &scatter = api.agentModel.singletons->scatter;
+    auto &scan = scatter.Scan();  // The scan compaction buffers are reused below as scratch storage for the sort keys and indices
+    const CAgentDescription agentDesc(agent.getAgentDescription());
+    {  // Check variable 1 is valid: it must be a scalar (length 1) and its stored type must equal Var1T
+        std::type_index typ = agentDesc.getVariableType(variable1);  // This will throw name exception
+        if (agentDesc.getVariableLength(variable1) != 1) {
+            THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
+        }
+        if (std::type_index(typeid(Var1T)) != typ) {
+            THROW exception::InvalidVarType("Wrong type for variable '%s' passed to HostAgentAPI::sort(). "
+                "This call expects '%s', but '%s' was requested.",
+                variable1.c_str(), agentDesc.getVariableType(variable1).name(), typeid(Var1T).name());
+        }
+    }
+    {  // Check variable 2 is valid: it must be a scalar (length 1) and its stored type must equal Var2T
+        std::type_index typ = agentDesc.getVariableType(variable2);  // This will throw name exception
+        if (agentDesc.getVariableLength(variable2) != 1) {
+            THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
+        }
+        if (std::type_index(typeid(Var2T)) != typ) {
+            THROW exception::InvalidVarType("Wrong type for variable '%s' passed to HostAgentAPI::sort(). "
+                "This call expects '%s', but '%s' was requested.",
+                variable2.c_str(), agentDesc.getVariableType(variable2).name(), typeid(Var2T).name());
+        }
+    }
+    const unsigned int agentCount = agent.getStateSize(stateName);
+    // Copy the var1 values into the AGENT_DEATH 'position' buffer (keys1b); they are reordered into keys1 after the var2 sort
+    {
+        // Resize: the byte size is converted to a count of unsigned int (+1 covers any remainder lost to integer division); presumably resize() counts in unsigned int elements - confirm
+        const size_t total_variable_buffer_size = sizeof(Var1T) * agentCount;
+        const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
+        scan.resize(fake_num_agent, detail::CUDAScanCompaction::AGENT_DEATH, streamId);
+        // Fill with a device-to-device copy queued on stream
+        void *keys1b = scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position;
+        void *var_ptr = agent.getStateVariablePtr(stateName, variable1);
+        gpuErrchk(cudaMemcpyAsync(keys1b, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
+    }
+    // Copy the var2 values into the MESSAGE_OUTPUT 'scan_flag' buffer (keys2), which is later sorted in place
+    {
+        // Resize: at least agentCount elements are requested, as the same config's 'position' buffer is used below as the unsigned int index array (vals)
+        const size_t total_variable_buffer_size = sizeof(Var2T) * agentCount;
+        const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
+        scan.resize(std::max(agentCount, fake_num_agent), detail::CUDAScanCompaction::MESSAGE_OUTPUT, streamId);
+        // Fill with a device-to-device copy queued on stream
+        void *keys2 = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag;
+        void *var_ptr = agent.getStateVariablePtr(stateName, variable2);
+        gpuErrchk(cudaMemcpyAsync(keys2, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
+    }
+    // Define our buffers (here, after resize; presumably because resize may reallocate and change the pointers - confirm)
+    Var1T *keys1 = reinterpret_cast<Var1T *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.scan_flag);  // Destination for var1 keys once reordered by the var2 sort
+    Var1T *keys1b = reinterpret_cast<Var1T *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position);  // Unsorted copy of var1 made above
+    Var2T *keys2 = reinterpret_cast<Var2T *>(scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag);  // Copy of var2 made above
+    unsigned int *vals = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.position;  // Values permuted alongside the keys by both sorts
+    // Init value array (presumably vals[i] = i, going by the helper's name - confirm)
+    fillTIDArray_async(vals, agentCount, stream);
+    // Process variable 2 first (secondary key); the later stable sort on variable 1 keeps this order among equal var1 values
+    {
+        // pair sort values: stable sort of keys2 on stream, with vals permuted alongside
+        if (order2 == Asc) {
+            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var2T>(keys2), thrust::device_ptr<Var2T>(keys2 + agentCount),
+            thrust::device_ptr<unsigned int>(vals), thrust::less<Var2T>());
+        } else {
+            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var2T>(keys2), thrust::device_ptr<Var2T>(keys2 + agentCount),
+            thrust::device_ptr<unsigned int>(vals), thrust::greater<Var2T>());
+        }
+        gpuErrchkLaunch();
+        // sort keys1 based on this order (presumably reads keys1b and writes keys1 in the order given by vals - confirm against sortBuffer_async)
+        sortBuffer_async(keys1, keys1b, vals, sizeof(Var1T), agentCount, stream);
+    }
+    // Process variable 1 second (primary key)
+    {
+        // pair sort: stable sort of keys1 on stream, with vals permuted alongside
+        if (order1 == Asc) {
+            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var1T>(keys1), thrust::device_ptr<Var1T>(keys1 + agentCount),
+            thrust::device_ptr<unsigned int>(vals), thrust::less<Var1T>());
+        } else {
+            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var1T>(keys1), thrust::device_ptr<Var1T>(keys1 + agentCount),
+            thrust::device_ptr<unsigned int>(vals), thrust::greater<Var1T>());
+        }
+        gpuErrchkLaunch();
+    }
+    // Scatter all agent variables
+    api.agentModel.agent_map.at(agentDesc.getName())->scatterSort_async(stateName, scatter, streamId, stream);
+
+    if (population) {
+        // If the user has a DeviceAgentVector out, purge cache so it redownloads new data on next use
+        population->purgeCache();
+    }
+}
+
+}  // namespace flamegpu
+
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_
diff --git a/include/flamegpu/runtime/HostNewAgentAPI.h b/include/flamegpu/runtime/agent/HostNewAgentAPI.h
similarity index 85%
rename from include/flamegpu/runtime/HostNewAgentAPI.h
rename to include/flamegpu/runtime/agent/HostNewAgentAPI.h
index d699d4f60..7df6e0200 100644
--- a/include/flamegpu/runtime/HostNewAgentAPI.h
+++ b/include/flamegpu/runtime/agent/HostNewAgentAPI.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_HOSTNEWAGENTAPI_H_
-#define INCLUDE_FLAMEGPU_RUNTIME_HOSTNEWAGENTAPI_H_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTNEWAGENTAPI_H_
+#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTNEWAGENTAPI_H_
 
 #include <unordered_map>
 #include <string>
@@ -7,7 +7,7 @@
 
 #include "flamegpu/model/Variable.h"
 #include "flamegpu/defines.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/detail/type_decode.h"
 
 namespace flamegpu {
 
@@ -138,13 +138,13 @@ struct NewAgentStorage {
                 "in NewAgentStorage::setVariable().",
                 var_name.c_str());
         }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s', incorrect  type '%s' was requested, "
                 "in NewAgentStorage::setVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len != sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t) {
+        if (var->second.len != sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t) {
             THROW exception::InvalidAgentVar("This method is not suitable for agent array variables, "
                 " variable '%s' was passed, "
                 "in NewAgentStorage::setVariable().",
@@ -160,23 +160,23 @@ struct NewAgentStorage {
                 "in NewAgentStorage::setVariable().",
                 var_name.c_str());
         }
-        if (N && N != var->second.len / sizeof(typename type_decode<T>::type_t)) {
+        if (N && N != var->second.len / sizeof(typename detail::type_decode<T>::type_t)) {
             THROW exception::InvalidAgentVar("Agent variable '%s' length mismatch %u != %u, "
                 "in NewAgentStorage::setVariable().",
-                var_name.c_str(), N, var->second.len / sizeof(typename type_decode<T>::type_t));
+                var_name.c_str(), N, var->second.len / sizeof(typename detail::type_decode<T>::type_t));
         }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::setVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len < (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t) * (index + 1)) {
+        if (var->second.len < (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t) * (index + 1)) {
             THROW exception::OutOfRangeVarArray("Variable '%s' is an array with %u elements, index %u is out of range, "
                 "in NewAgentStorage::setVariable().",
-                var_name.c_str(), var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), index);
+                var_name.c_str(), var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), index);
         }
-        memcpy(data + var->second.offset + (index * sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), &val, sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t);
+        memcpy(data + var->second.offset + (index * sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), &val, sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t);
     }
 #ifndef SWIG
     template<typename T, unsigned int N>
@@ -192,16 +192,16 @@ struct NewAgentStorage {
         //         "in NewAgentStorage::setVariable().",
         //         var_name.c_str());
         // }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::setVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len != sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t * N) {
+        if (var->second.len != sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t * N) {
             THROW exception::InvalidVarArrayLen("Variable '%s' is an array with %u elements, incorrect array of length %u was provided, "
                 "in NewAgentStorage::setVariable().",
-                var_name.c_str(), var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), N);
+                var_name.c_str(), var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), N);
         }
         memcpy(data + var->second.offset, val.data(), var->second.len);
     }
@@ -219,16 +219,16 @@ struct NewAgentStorage {
         //         "in NewAgentStorage::setVariableArray().",
         //         var_name.c_str());
         // }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::setVariableArray().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len != sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t * val.size()) {
+        if (var->second.len != sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t * val.size()) {
             THROW exception::InvalidVarArrayLen("Variable '%s' is an array with %u elements, incorrect array of length %u was provided, "
                 "in NewAgentStorage::setVariableArray().",
-                var_name.c_str(), var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), val.size());
+                var_name.c_str(), var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), val.size());
         }
         memcpy(data + var->second.offset, val.data(), var->second.len);
     }
@@ -241,13 +241,13 @@ struct NewAgentStorage {
                 "in NewAgentStorage::getVariable()",
                 var_name.c_str());
         }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::getVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len != sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t) {
+        if (var->second.len != sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t) {
             THROW exception::InvalidAgentVar("This method is not suitable for agent array variables, "
                 " variable '%s' was passed, "
                 "in NewAgentStorage::getVariable().",
@@ -263,23 +263,23 @@ struct NewAgentStorage {
                 "in NewAgentStorage::getVariable().",
                 var_name.c_str());
         }
-        if (N && N != var->second.len / sizeof(typename type_decode<T>::type_t)) {
+        if (N && N != var->second.len / sizeof(typename detail::type_decode<T>::type_t)) {
             THROW exception::InvalidAgentVar("Agent variable '%s' length mismatch %u != %u, "
                 "in NewAgentStorage::getVariable().",
-                var_name.c_str(), N, var->second.len / sizeof(typename type_decode<T>::type_t));
+                var_name.c_str(), N, var->second.len / sizeof(typename detail::type_decode<T>::type_t));
         }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::getVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len < sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t * (index + 1)) {
+        if (var->second.len < sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t * (index + 1)) {
             THROW exception::OutOfRangeVarArray("Variable '%s' is an array with %u elements, index %u is out of range, "
                 "in NewAgentStorage::getVariable().",
-                var_name.c_str(), var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), index);
+                var_name.c_str(), var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), index);
         }
-        return *reinterpret_cast<T*>(data + var->second.offset + (index * sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t));
+        return *reinterpret_cast<T*>(data + var->second.offset + (index * sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t));
     }
 #ifndef SWIG
     template<typename T, unsigned int N>
@@ -295,16 +295,16 @@ struct NewAgentStorage {
         //         "in NewAgentStorage::getVariable().",
         //         var_name.c_str());
         // }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::getVariable().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len != sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t * N) {
+        if (var->second.len != sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t * N) {
             THROW exception::InvalidVarArrayLen("Variable '%s' is an array with %u elements, incorrect array of length %u was specified, "
                 "in NewAgentStorage::getVariable().",
-                var_name.c_str(), var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t), N);
+                var_name.c_str(), var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t), N);
         }
         std::array<T, N> rtn;
         memcpy(rtn.data(), data + var->second.offset, var->second.len);
@@ -319,18 +319,18 @@ struct NewAgentStorage {
                 "in NewAgentStorage::getVariableArray().",
                 var_name.c_str());
         }
-        const auto t_type = std::type_index(typeid(typename type_decode<T>::type_t));
+        const auto t_type = std::type_index(typeid(typename detail::type_decode<T>::type_t));
         if (var->second.type != t_type) {
             THROW exception::InvalidVarType("Variable '%s' has type '%s, incorrect  type '%s' was requested, "
                 "in NewAgentStorage::getVariableArray().",
                 var_name.c_str(), var->second.type.name(), t_type.name());
         }
-        if (var->second.len % (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t) != 0) {
+        if (var->second.len % (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t) != 0) {
             THROW exception::InvalidVarType("Variable '%s' has length (%llu) is not divisible by vector length (%u), "
                 "in NewAgentStorage::getVariableArray().",
-                var_name.c_str(), var->second.len / sizeof(typename type_decode<T>::type_t), type_decode<T>::len_t);
+                var_name.c_str(), var->second.len / sizeof(typename detail::type_decode<T>::type_t), detail::type_decode<T>::len_t);
         }
-        const size_t elements = var->second.len / (sizeof(typename type_decode<T>::type_t) * type_decode<T>::len_t);
+        const size_t elements = var->second.len / (sizeof(typename detail::type_decode<T>::type_t) * detail::type_decode<T>::len_t);
         std::vector<T> rtn(elements);
         memcpy(rtn.data(), data + var->second.offset, var->second.len);
         return rtn;
@@ -455,4 +455,4 @@ class HostNewAgentAPI {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_HOSTNEWAGENTAPI_H_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTNEWAGENTAPI_H_
diff --git a/include/flamegpu/runtime/detail/curve/DeviceCurve.cuh b/include/flamegpu/runtime/detail/curve/DeviceCurve.cuh
index 97d01b17e..5d23d96f3 100644
--- a/include/flamegpu/runtime/detail/curve/DeviceCurve.cuh
+++ b/include/flamegpu/runtime/detail/curve/DeviceCurve.cuh
@@ -4,7 +4,7 @@
 #include "flamegpu/runtime/detail/SharedBlock.h"
 #include "flamegpu/runtime/detail/curve/Curve.cuh"
 #include "flamegpu/exception/FLAMEGPUDeviceException_device.cuh"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/detail/type_decode.h"
 
 #ifdef FLAMEGPU_USE_GLM
 #ifdef __CUDACC__
@@ -268,11 +268,11 @@ __device__ __forceinline__ char* DeviceCurve::getVariablePtr(const char(&variabl
     if (cv == UNKNOWN_VARIABLE) {
         DTHROW("Curve variable with name '%s' was not found.\n", variableName);
         return nullptr;
-    } else if (sm()->curve_type_size[cv] != sizeof(typename type_decode<T>::type_t)) {
-        DTHROW("Curve variable with name '%s', type size mismatch %u != %llu.\n", variableName, sm()->curve_type_size[cv], sizeof(typename type_decode<T>::type_t));
+    } else if (sm()->curve_type_size[cv] != sizeof(typename detail::type_decode<T>::type_t)) {
+        DTHROW("Curve variable with name '%s', type size mismatch %u != %llu.\n", variableName, sm()->curve_type_size[cv], sizeof(typename detail::type_decode<T>::type_t));
         return nullptr;
-    } else if (!(sm()->curve_elements[cv] == type_decode<T>::len_t * N || (namespace_hash == Curve::variableHash("_environment") && N == 0))) {  // Special case, environment can avoid specifying N
-        DTHROW("Curve variable with name '%s', variable array length mismatch %u != %u.\n", variableName, sm()->curve_elements[cv], type_decode<T>::len_t);
+    } else if (!(sm()->curve_elements[cv] == detail::type_decode<T>::len_t * N || (namespace_hash == Curve::variableHash("_environment") && N == 0))) {  // Special case, environment can avoid specifying N
+        DTHROW("Curve variable with name '%s', variable array length mismatch %u != %u.\n", variableName, sm()->curve_elements[cv], detail::type_decode<T>::len_t);
         return nullptr;
     } else if (offset >= sm()->curve_type_size[cv] * sm()->curve_elements[cv] * sm()->curve_count[cv]) {  // Note : offset is basically index * sizeof(T)
         DTHROW("Curve variable with name '%s', offset exceeds buffer length  %u >= %u.\n", offset, sm()->curve_type_size[cv] * sm()->curve_elements[cv] * sm()->curve_count[cv]);
@@ -288,7 +288,7 @@ __device__ __forceinline__ char* DeviceCurve::getVariablePtr(const char(&variabl
 template <typename T, unsigned int N, unsigned int M>
 __device__ __forceinline__ T DeviceCurve::getVariable(const char(&variableName)[M], const VariableHash namespace_hash, const unsigned int agent_index, const unsigned int array_index) {
     using detail::sm;
-    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename type_decode<T>::type_t);
+    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename detail::type_decode<T>::type_t);
     T *value_ptr = reinterpret_cast<T*>(getVariablePtr<T, N>(variableName, namespace_hash, buffer_offset));
 
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
@@ -299,7 +299,7 @@ __device__ __forceinline__ T DeviceCurve::getVariable(const char(&variableName)[
 }
 template <typename T, unsigned int N, unsigned int M>
 __device__ __forceinline__ T DeviceCurve::getVariable_ldg(const char(&variableName)[M], const VariableHash namespace_hash, const unsigned int agent_index, const unsigned int array_index) {
-    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename type_decode<T>::type_t);
+    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename detail::type_decode<T>::type_t);
     T *value_ptr = reinterpret_cast<T*>(getVariablePtr<T, N>(variableName, namespace_hash, buffer_offset));
 
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
@@ -310,7 +310,7 @@ __device__ __forceinline__ T DeviceCurve::getVariable_ldg(const char(&variableNa
 }
 template <typename T, unsigned int N, unsigned int M>
 __device__ __forceinline__ void DeviceCurve::setVariable(const char(&variableName)[M], const VariableHash namespace_hash, const T variable, const unsigned int agent_index, const unsigned int array_index) {
-    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename type_decode<T>::type_t);
+    const unsigned int buffer_offset = agent_index * static_cast<unsigned int>(sizeof(T)) * N + array_index * sizeof(typename detail::type_decode<T>::type_t);
     T* value_ptr = reinterpret_cast<T*>(getVariablePtr<T, N>(variableName, namespace_hash, buffer_offset));
 
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
diff --git a/include/flamegpu/runtime/detail/curve/HostCurve.cuh b/include/flamegpu/runtime/detail/curve/HostCurve.cuh
index ed34c9ba8..eebd90b59 100644
--- a/include/flamegpu/runtime/detail/curve/HostCurve.cuh
+++ b/include/flamegpu/runtime/detail/curve/HostCurve.cuh
@@ -12,9 +12,9 @@
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
 
 namespace flamegpu {
+namespace detail {
 // forward declare classes from other modules
 class CUDAAgent;
-namespace detail {
 namespace curve {
 
 /**
@@ -90,7 +90,7 @@ class HostCurve {
     /**
      * Has access to call purge
      */
-    friend class flamegpu::CUDAAgent;
+    friend class detail::CUDAAgent;
     /**
      * Host and device storage of curve table
      */
diff --git a/include/flamegpu/runtime/utility/DeviceEnvironment.cuh b/include/flamegpu/runtime/environment/DeviceEnvironment.cuh
similarity index 95%
rename from include/flamegpu/runtime/utility/DeviceEnvironment.cuh
rename to include/flamegpu/runtime/environment/DeviceEnvironment.cuh
index 8f044a997..6efd9a612 100644
--- a/include/flamegpu/runtime/utility/DeviceEnvironment.cuh
+++ b/include/flamegpu/runtime/environment/DeviceEnvironment.cuh
@@ -1,12 +1,12 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEENVIRONMENT_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEENVIRONMENT_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEENVIRONMENT_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEENVIRONMENT_CUH_
 
 // #include <cuda_runtime.h>
 #include <string>
 #include <cassert>
 
-#include "flamegpu/runtime/utility/DeviceMacroProperty.cuh"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/runtime/environment/DeviceMacroProperty.cuh"
+#include "flamegpu/detail/type_decode.h"
 #ifndef __CUDACC_RTC__
 #include "flamegpu/runtime/detail/curve/DeviceCurve.cuh"
 #endif
@@ -127,4 +127,4 @@ __device__ __forceinline__ DeviceMacroProperty<T, I, J, K, W> DeviceEnvironment:
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEENVIRONMENT_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEENVIRONMENT_CUH_
diff --git a/include/flamegpu/runtime/utility/DeviceMacroProperty.cuh b/include/flamegpu/runtime/environment/DeviceMacroProperty.cuh
similarity index 99%
rename from include/flamegpu/runtime/utility/DeviceMacroProperty.cuh
rename to include/flamegpu/runtime/environment/DeviceMacroProperty.cuh
index eeec5ecb7..2c97a8b00 100644
--- a/include/flamegpu/runtime/utility/DeviceMacroProperty.cuh
+++ b/include/flamegpu/runtime/environment/DeviceMacroProperty.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEMACROPROPERTY_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEMACROPROPERTY_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEMACROPROPERTY_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEMACROPROPERTY_CUH_
 
 #include <cstdint>
 #include <limits>
@@ -492,4 +492,4 @@ __device__ __forceinline__ T DeviceMacroProperty<T, I, J, K, W>::exchange(T val)
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_DEVICEMACROPROPERTY_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_DEVICEMACROPROPERTY_CUH_
diff --git a/include/flamegpu/runtime/utility/HostEnvironment.cuh b/include/flamegpu/runtime/environment/HostEnvironment.cuh
similarity index 93%
rename from include/flamegpu/runtime/utility/HostEnvironment.cuh
rename to include/flamegpu/runtime/environment/HostEnvironment.cuh
index b5b15b4ac..66c264d8e 100644
--- a/include/flamegpu/runtime/utility/HostEnvironment.cuh
+++ b/include/flamegpu/runtime/environment/HostEnvironment.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTENVIRONMENT_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTENVIRONMENT_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTENVIRONMENT_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTENVIRONMENT_CUH_
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>  // Required for FLAMEGPU_SEATBELTS=OFF builds for some reason.
@@ -12,10 +12,10 @@
 #include <vector>
 #include <memory>
 
-#include "flamegpu/gpu/CUDAMacroEnvironment.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
-#include "flamegpu/runtime/utility/HostMacroProperty.cuh"
+#include "flamegpu/simulation/detail/CUDAMacroEnvironment.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
+#include "flamegpu/runtime/environment/HostMacroProperty.cuh"
 
 namespace flamegpu {
 
@@ -36,15 +36,15 @@ class HostEnvironment {
     /**
      * Constructor, to be called by HostAPI
      */
-    explicit HostEnvironment(unsigned int instance_id, const std::shared_ptr<EnvironmentManager> &env, CUDAMacroEnvironment &_macro_env);
+    explicit HostEnvironment(unsigned int instance_id, const std::shared_ptr<detail::EnvironmentManager> &env, detail::CUDAMacroEnvironment &_macro_env);
     /**
      * Provides access to EnvironmentManager singleton
      */
-    const std::shared_ptr<EnvironmentManager> env_mgr;
+    const std::shared_ptr<detail::EnvironmentManager> env_mgr;
     /**
      * Provides access to macro properties for the instance
      */
-    CUDAMacroEnvironment& macro_env;
+    detail::CUDAMacroEnvironment& macro_env;
     /**
      * Access to instance id of the CUDASimulation
      * This is used to augment all variable names
@@ -230,4 +230,4 @@ HostMacroProperty_swig<T> HostEnvironment::getMacroProperty_swig(const std::stri
 #endif
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTENVIRONMENT_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTENVIRONMENT_CUH_
diff --git a/include/flamegpu/runtime/utility/HostMacroProperty.cuh b/include/flamegpu/runtime/environment/HostMacroProperty.cuh
similarity index 99%
rename from include/flamegpu/runtime/utility/HostMacroProperty.cuh
rename to include/flamegpu/runtime/environment/HostMacroProperty.cuh
index a78cd9268..3412d1fc2 100644
--- a/include/flamegpu/runtime/utility/HostMacroProperty.cuh
+++ b/include/flamegpu/runtime/environment/HostMacroProperty.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTMACROPROPERTY_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTMACROPROPERTY_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTMACROPROPERTY_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTMACROPROPERTY_CUH_
 
 #include <cstdint>
 #include <algorithm>
@@ -569,4 +569,4 @@ T HostMacroProperty_swig<T>::get() const {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTMACROPROPERTY_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_ENVIRONMENT_HOSTMACROPROPERTY_CUH_
diff --git a/include/flamegpu/runtime/messaging/MessageArray/MessageArrayHost.h b/include/flamegpu/runtime/messaging/MessageArray/MessageArrayHost.h
index f82802d5a..445371b80 100644
--- a/include/flamegpu/runtime/messaging/MessageArray/MessageArrayHost.h
+++ b/include/flamegpu/runtime/messaging/MessageArray/MessageArrayHost.h
@@ -22,7 +22,7 @@ class MessageArray::CUDAModelHandler : public MessageSpecialisationHandler {
      * Allocates memory on device for message list length
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-     explicit CUDAModelHandler(CUDAMessage &a);
+     explicit CUDAModelHandler(detail::CUDAMessage &a);
     /** 
      * Destructor.
      * Should free any local host memory (device memory cannot be freed in destructors)
@@ -35,7 +35,7 @@ class MessageArray::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Sort messages according to index
      * Detect and report any duplicate indicies/gaps
@@ -43,7 +43,7 @@ class MessageArray::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -70,7 +70,7 @@ class MessageArray::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
     /**
      * Buffer used by buildIndex if array length > agent count
      */
@@ -91,7 +91,7 @@ struct MessageArray::Data : public MessageBruteForce::Data {
     size_type length;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
      * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageArray2D/MessageArray2DHost.h b/include/flamegpu/runtime/messaging/MessageArray2D/MessageArray2DHost.h
index fef37e625..448f7d5c2 100644
--- a/include/flamegpu/runtime/messaging/MessageArray2D/MessageArray2DHost.h
+++ b/include/flamegpu/runtime/messaging/MessageArray2D/MessageArray2DHost.h
@@ -23,7 +23,7 @@ class MessageArray2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * Allocates memory on device for message list length
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-     explicit CUDAModelHandler(CUDAMessage &a);
+     explicit CUDAModelHandler(detail::CUDAMessage &a);
     /** 
      * Destructor.
      * Should free any local host memory (device memory cannot be freed in destructors)
@@ -36,7 +36,7 @@ class MessageArray2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Sort messages according to index
      * Detect and report any duplicate indicies/gaps
@@ -44,7 +44,7 @@ class MessageArray2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -71,7 +71,7 @@ class MessageArray2D::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
     /**
      * Buffer used by buildIndex if array length > agent count
      */
@@ -92,7 +92,7 @@ struct MessageArray2D::Data : public MessageBruteForce::Data {
     std::array<size_type, 2> dimensions;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
      * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageArray3D/MessageArray3DHost.h b/include/flamegpu/runtime/messaging/MessageArray3D/MessageArray3DHost.h
index 6cb264781..9fb9154d6 100644
--- a/include/flamegpu/runtime/messaging/MessageArray3D/MessageArray3DHost.h
+++ b/include/flamegpu/runtime/messaging/MessageArray3D/MessageArray3DHost.h
@@ -23,7 +23,7 @@ class MessageArray3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * Allocates memory on device for message list length
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-     explicit CUDAModelHandler(CUDAMessage &a);
+     explicit CUDAModelHandler(detail::CUDAMessage &a);
     /** 
      * Destructor.
      * Should free any local host memory (device memory cannot be freed in destructors)
@@ -36,7 +36,7 @@ class MessageArray3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Sort messages according to index
      * Detect and report any duplicate indicies/gaps
@@ -44,7 +44,7 @@ class MessageArray3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -71,7 +71,7 @@ class MessageArray3D::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
     /**
      * Buffer used by buildIndex if array length > agent count
      */
@@ -92,7 +92,7 @@ struct MessageArray3D::Data : public MessageBruteForce::Data {
     std::array<size_type, 3> dimensions;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
      * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h b/include/flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h
index 7c3e83d25..99ae38d18 100644
--- a/include/flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h
+++ b/include/flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h
@@ -8,12 +8,12 @@
 #include <vector>
 
 #include "flamegpu/model/Variable.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "flamegpu/runtime/messaging/MessageNone/MessageNoneHost.h"
 #include "flamegpu/runtime/messaging/MessageBruteForce.h"
 #include "flamegpu/runtime/messaging/MessageSortingType.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/detail/type_decode.h"
 
 namespace flamegpu {
 
@@ -28,7 +28,7 @@ class MessageBruteForce::CUDAModelHandler : public MessageSpecialisationHandler
      * Allocates memory on device for message list length
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-    explicit CUDAModelHandler(CUDAMessage &a)
+    explicit CUDAModelHandler(detail::CUDAMessage &a)
         : MessageSpecialisationHandler()
         , d_metadata(nullptr)
         , sim_message(a) { }
@@ -45,14 +45,14 @@ class MessageBruteForce::CUDAModelHandler : public MessageSpecialisationHandler
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Updates the length of the messagelist stored on device
      * @param scatter Scatter instance and scan arrays to be used (CUDASimulation::singletons->scatter)
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -79,7 +79,7 @@ class MessageBruteForce::CUDAModelHandler : public MessageSpecialisationHandler
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
 };
 
 /**
@@ -129,7 +129,7 @@ struct MessageBruteForce::Data {
      */
     Data(const Data &other) = delete;
 
-    virtual std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const;
+    virtual std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const;
 
     /**
      * Used internally to validate that the corresponding Message type is attached via the agent function shim.
@@ -334,9 +334,9 @@ void MessageBruteForce::CDescription::newVariable(const std::string& variable_na
             "in MessageDescription::newVariable().");
     }
     // Array length 0 makes no sense
-    static_assert(type_decode<T>::len_t * N > 0, "A variable cannot have 0 elements.");
+    static_assert(detail::type_decode<T>::len_t * N > 0, "A variable cannot have 0 elements.");
     if (message->variables.find(variable_name) == message->variables.end()) {
-        message->variables.emplace(variable_name, Variable(std::array<typename type_decode<T>::type_t, type_decode<T>::len_t * N>{}));
+        message->variables.emplace(variable_name, Variable(std::array<typename detail::type_decode<T>::type_t, detail::type_decode<T>::len_t * N>{}));
         return;
     }
     THROW exception::InvalidMessageVar("Message ('%s') already contains variable '%s', "
@@ -355,8 +355,8 @@ void MessageBruteForce::CDescription::newVariableArray(const std::string& variab
             "in MessageDescription::newVariable().");
     }
     if (message->variables.find(variable_name) == message->variables.end()) {
-        std::vector<typename type_decode<T>::type_t> temp(static_cast<size_t>(type_decode<T>::len_t * length));
-        message->variables.emplace(variable_name, Variable(type_decode<T>::len_t * length, temp));
+        std::vector<typename detail::type_decode<T>::type_t> temp(static_cast<size_t>(detail::type_decode<T>::len_t * length));
+        message->variables.emplace(variable_name, Variable(detail::type_decode<T>::len_t * length, temp));
         return;
     }
     THROW exception::InvalidMessageVar("Message ('%s') already contains variable '%s', "
diff --git a/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h b/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h
index 4592d7815..f9cf8ee9b 100644
--- a/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h
+++ b/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h
@@ -23,7 +23,7 @@ class MessageBucket::CUDAModelHandler : public MessageSpecialisationHandler {
     *
     * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
     */
-    explicit CUDAModelHandler(CUDAMessage &a);
+    explicit CUDAModelHandler(detail::CUDAMessage &a);
     /**
     * Destructor
     * Frees all allocated memory
@@ -36,7 +36,7 @@ class MessageBucket::CUDAModelHandler : public MessageSpecialisationHandler {
     * @param streamId Index of stream specific structures used
      * @param stream The CUDAStream to use for CUDA operations
     */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Reconstructs the partition boundary matrix
      * This should be called before reading newly output messages
@@ -44,7 +44,7 @@ class MessageBucket::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
     * Allocates memory for the constructed index.
     * The memory allocation is checked by build index.
@@ -108,7 +108,7 @@ class MessageBucket::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
     * Owning CUDAMessage, provides access to message storage etc
     */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
 };
 
 /**
@@ -130,7 +130,7 @@ struct MessageBucket::Data : public MessageBruteForce::Data {
     IntT upperBound;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
     * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageNone/MessageNoneHost.h b/include/flamegpu/runtime/messaging/MessageNone/MessageNoneHost.h
index bc5c9fcb3..28e5024f2 100644
--- a/include/flamegpu/runtime/messaging/MessageNone/MessageNoneHost.h
+++ b/include/flamegpu/runtime/messaging/MessageNone/MessageNoneHost.h
@@ -5,8 +5,9 @@
 #include "flamegpu/runtime/messaging/MessageNone.h"
 
 namespace flamegpu {
-
+namespace detail {
 class CUDAMessage;
+}  // namespace detail
 
 /**
  * Provides specialisation behaviour for messages between agent functions
@@ -18,14 +19,14 @@ class MessageNone::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Constructor
      */
-    explicit CUDAModelHandler(CUDAMessage &a)
+    explicit CUDAModelHandler(detail::CUDAMessage &a)
         : MessageSpecialisationHandler()
         , sim_message(a)
     { }
     /**
      * Owning CUDAMessage
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
 };
 
 }  // namespace flamegpu
diff --git a/include/flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h b/include/flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h
index 088cb7825..2986b7a25 100644
--- a/include/flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h
+++ b/include/flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h
@@ -24,7 +24,7 @@ class MessageSpatial2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * 
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-     explicit CUDAModelHandler(CUDAMessage &a);
+     explicit CUDAModelHandler(detail::CUDAMessage &a);
     /**
      * Destructor
      * Frees all alocated memory
@@ -37,7 +37,7 @@ class MessageSpatial2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId Index of stream specific structures used
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Reconstructs the partition boundary matrix
      * This should be called before reading newly output messages
@@ -45,7 +45,7 @@ class MessageSpatial2D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -109,7 +109,7 @@ class MessageSpatial2D::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
 };
 
 /**
@@ -126,7 +126,7 @@ struct MessageSpatial2D::Data : public MessageBruteForce::Data {
     float maxY;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
      * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DHost.h b/include/flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DHost.h
index 185418c8e..85bd026d5 100644
--- a/include/flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DHost.h
+++ b/include/flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DHost.h
@@ -4,7 +4,7 @@
 #include <memory>
 #include <string>
 
-#include "flamegpu/gpu/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
 #include "flamegpu/util/nvtx.h"
 #include "flamegpu/runtime/messaging/MessageSpatial3D.h"
 #include "flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h"
@@ -25,7 +25,7 @@ class MessageSpatial3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * 
      * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
      */
-     explicit CUDAModelHandler(CUDAMessage& a);
+     explicit CUDAModelHandler(detail::CUDAMessage& a);
     /**
      * Destructor
      * Frees all alocated memory
@@ -38,7 +38,7 @@ class MessageSpatial3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId Index of stream specific structures used
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Reconstructs the partition boundary matrix
      * This should be called before reading newly output messages
@@ -46,7 +46,7 @@ class MessageSpatial3D::CUDAModelHandler : public MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) override;
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
@@ -110,7 +110,7 @@ class MessageSpatial3D::CUDAModelHandler : public MessageSpecialisationHandler {
     /**
      * Owning CUDAMessage, provides access to message storage etc
      */
-    CUDAMessage &sim_message;
+    detail::CUDAMessage &sim_message;
 };
 
 /**
@@ -124,7 +124,7 @@ struct MessageSpatial3D::Data : public MessageSpatial2D::Data {
     float maxZ;
     virtual ~Data() = default;
 
-    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(CUDAMessage &owner) const override;
+    std::unique_ptr<MessageSpecialisationHandler> getSpecialisationHander(detail::CUDAMessage &owner) const override;
 
     /**
     * Used internally to validate that the corresponding Message type is attached via the agent function shim.
diff --git a/include/flamegpu/runtime/messaging/MessageSpecialisationHandler.h b/include/flamegpu/runtime/messaging/MessageSpecialisationHandler.h
index 65f8936ec..fd0970f9d 100644
--- a/include/flamegpu/runtime/messaging/MessageSpecialisationHandler.h
+++ b/include/flamegpu/runtime/messaging/MessageSpecialisationHandler.h
@@ -2,8 +2,9 @@
 #define INCLUDE_FLAMEGPU_RUNTIME_MESSAGING_MESSAGESPECIALISATIONHANDLER_H_
 
 namespace flamegpu {
-
+namespace detail {
 class CUDAScatter;
+}  // namespace detail
 /**
  * Interface for message specialisation
  * A derived implementation of this is required for each combination of message type (e.g. MessageBruteForce) and simulation type (e.g. CUDASimulation)
@@ -23,7 +24,7 @@ class MessageSpecialisationHandler {
      * @param streamId Index of stream specific structures used
      * @param stream The CUDAStream to use for CUDA operations
      */
-    virtual void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) = 0;
+    virtual void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) = 0;
     /**
      * Constructs an index for the message data structure (e.g. Partition boundary matrix for spatial message types)
      * This is called the first time messages are read, after new messages have been output
@@ -31,7 +32,7 @@ class MessageSpecialisationHandler {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream The CUDAStream to use for CUDA operations
      */
-    virtual void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) { }
+    virtual void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) { }
     /**
      * Allocates memory for the constructed index.
      * The memory allocation is checked by build index.
diff --git a/include/flamegpu/runtime/utility/AgentRandom.cuh b/include/flamegpu/runtime/random/AgentRandom.cuh
similarity index 88%
rename from include/flamegpu/runtime/utility/AgentRandom.cuh
rename to include/flamegpu/runtime/random/AgentRandom.cuh
index 4c6196cba..3e5a606fc 100644
--- a/include/flamegpu/runtime/utility/AgentRandom.cuh
+++ b/include/flamegpu/runtime/random/AgentRandom.cuh
@@ -1,10 +1,10 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_RANDOM_AGENTRANDOM_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_RANDOM_AGENTRANDOM_CUH_
 
 #include <cassert>
 
-#include "flamegpu/util/detail/curand.cuh"
-#include "flamegpu/util/detail/StaticAssert.h"
+#include "flamegpu/detail/curand.cuh"
+#include "flamegpu/detail/StaticAssert.h"
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
 
 namespace flamegpu {
@@ -21,7 +21,7 @@ class AgentRandom {
      * @param d_rng ThreadSafe device curand state instance
      *   this is a unique instance for the thread among all concurrently executing kernels
      */
-    __forceinline__ __device__ AgentRandom(util::detail::curandState *d_rng);
+    __forceinline__ __device__ AgentRandom(detail::curandState *d_rng);
     /**
      * Returns a float uniformly distributed between 0.0 and 1.0. 
      * @note It may return from 0.0 to 1.0, where 1.0 is included and 0.0 is excluded.
@@ -56,10 +56,10 @@ class AgentRandom {
     /**
      * Thread-safe index for accessing curand
      */
-    util::detail::curandState *d_random_state;
+    detail::curandState *d_random_state;
 };
 
-__forceinline__ __device__ AgentRandom::AgentRandom(util::detail::curandState *d_rng) : d_random_state(d_rng) { }
+__forceinline__ __device__ AgentRandom::AgentRandom(detail::curandState *d_rng) : d_random_state(d_rng) { }
 /**
  * All templates are specialised
  */
@@ -103,7 +103,7 @@ __forceinline__ __device__ double AgentRandom::logNormal(const double mean, cons
 */
 template<typename T>
 __forceinline__ __device__ T AgentRandom::uniform(T min, T max) const {
-    static_assert(util::detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for AgentRandom::uniform(T lowerBound, T max)");
+    static_assert(detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for AgentRandom::uniform(T lowerBound, T max)");
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
     if (min > max) {
         DTHROW("Invalid arguments passed to AgentRandom::uniform(), %lld > %lld\n", static_cast<int64_t>(min), static_cast<int64_t>(max));
@@ -150,4 +150,4 @@ __forceinline__ __device__ double AgentRandom::uniform(const double min, const d
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_AGENTRANDOM_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_RANDOM_AGENTRANDOM_CUH_
diff --git a/include/flamegpu/runtime/utility/HostRandom.cuh b/include/flamegpu/runtime/random/HostRandom.cuh
similarity index 80%
rename from include/flamegpu/runtime/utility/HostRandom.cuh
rename to include/flamegpu/runtime/random/HostRandom.cuh
index 33a013c7d..dd57c84bf 100644
--- a/include/flamegpu/runtime/utility/HostRandom.cuh
+++ b/include/flamegpu/runtime/random/HostRandom.cuh
@@ -1,10 +1,10 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTRANDOM_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTRANDOM_CUH_
+#ifndef INCLUDE_FLAMEGPU_RUNTIME_RANDOM_HOSTRANDOM_CUH_
+#define INCLUDE_FLAMEGPU_RUNTIME_RANDOM_HOSTRANDOM_CUH_
 
 #include <random>
 
-#include "flamegpu/util/detail/StaticAssert.h"
-#include "flamegpu/runtime/utility/RandomManager.cuh"
+#include "flamegpu/detail/StaticAssert.h"
+#include "flamegpu/simulation/detail/RandomManager.cuh"
 
 namespace flamegpu {
 
@@ -59,36 +59,36 @@ class HostRandom {
     uint64_t getSeed() const;
 
  private:
-    explicit HostRandom(RandomManager &_rng) : rng(_rng) { }
-    RandomManager &rng;
+    explicit HostRandom(detail::RandomManager &_rng) : rng(_rng) { }
+    detail::RandomManager &rng;
 };
 
 
 
 template<typename T>
 inline T HostRandom::uniform() const {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::uniform()");
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::uniform()");
     std::uniform_real_distribution<T> dist(0, 1);
     return rng.getDistribution<T>(dist);
 }
 
 template<typename T>
 inline T HostRandom::normal() const {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::normal()");
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::normal()");
     std::normal_distribution<T> dist(0, 1);
     return rng.getDistribution<T>(dist);
 }
 
 template<typename T>
 inline T HostRandom::logNormal(const T mean, const T stddev) const {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::logNormal(T mean, T stddev)");
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for HostRandom::logNormal(T mean, T stddev)");
     std::lognormal_distribution<T> dist(mean, stddev);
     return rng.getDistribution<T>(dist);
 }
 
 template<typename T>
 inline T HostRandom::uniform(const T min, const T max) const {
-    static_assert(util::detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for HostRandom::uniform(T lowerBound, T max)");
+    static_assert(detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for HostRandom::uniform(T lowerBound, T max)");
     std::uniform_int_distribution<T> dist(min, max);
     return rng.getDistribution<T>(dist);
 }
@@ -126,4 +126,4 @@ inline double HostRandom::uniform(const double min, const double max) const {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_HOSTRANDOM_CUH_
+#endif  // INCLUDE_FLAMEGPU_RUNTIME_RANDOM_HOSTRANDOM_CUH_
diff --git a/include/flamegpu/sim/AgentLoggingConfig.h b/include/flamegpu/simulation/AgentLoggingConfig.h
similarity index 84%
rename from include/flamegpu/sim/AgentLoggingConfig.h
rename to include/flamegpu/simulation/AgentLoggingConfig.h
index 815d97a99..05fccdea0 100644
--- a/include/flamegpu/sim/AgentLoggingConfig.h
+++ b/include/flamegpu/simulation/AgentLoggingConfig.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_H_
-#define INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_H_
 
 #include <string>
 #include <memory>
@@ -7,10 +7,10 @@
 #include <mutex>
 #include <utility>
 
-#include "flamegpu/sim/LoggingConfig.h"
-#include "flamegpu/sim/AgentLoggingConfig_Reductions.cuh"
-#include "flamegpu/sim/AgentLoggingConfig_SumReturn.h"
-#include "flamegpu/runtime/HostAgentAPI.cuh"
+#include "flamegpu/simulation/LoggingConfig.h"
+#include "flamegpu/simulation/AgentLoggingConfig_Reductions.cuh"
+#include "flamegpu/simulation/AgentLoggingConfig_SumReturn.h"
+#include "flamegpu/runtime/agent/HostAgentAPI.cuh"
 
 namespace flamegpu {
 
@@ -108,30 +108,30 @@ class AgentLoggingConfig {
  *  this runs on the host as an init/step/exit or host layer function
  */
 template<typename T>
-util::Any getAgentVariableMeanFunc(HostAgentAPI &ai, const std::string &variable_name) {
+detail::Any getAgentVariableMeanFunc(HostAgentAPI &ai, const std::string &variable_name) {
     if (ai.count() > 0)
-        return util::Any(ai.sum<T, typename sum_input_t<T>::result_t>(variable_name) / static_cast<double>(ai.count()));
-    return util::Any(static_cast<double>(0));
+        return detail::Any(ai.sum<T, typename sum_input_t<T>::result_t>(variable_name) / static_cast<double>(ai.count()));
+    return detail::Any(static_cast<double>(0));
 }
 template<typename T>
-util::Any getAgentVariableSumFunc(HostAgentAPI &ai, const std::string &variable_name) {
-    return util::Any(ai.sum<T, typename sum_input_t<T>::result_t>(variable_name));
+detail::Any getAgentVariableSumFunc(HostAgentAPI &ai, const std::string &variable_name) {
+    return detail::Any(ai.sum<T, typename sum_input_t<T>::result_t>(variable_name));
 }
 template<typename T>
-util::Any getAgentVariableMinFunc(HostAgentAPI &ai, const std::string &variable_name) {
-    return util::Any(ai.min<T>(variable_name));
+detail::Any getAgentVariableMinFunc(HostAgentAPI &ai, const std::string &variable_name) {
+    return detail::Any(ai.min<T>(variable_name));
 }
 template<typename T>
-util::Any getAgentVariableMaxFunc(HostAgentAPI &ai, const std::string &variable_name) {
-    return util::Any(ai.max<T>(variable_name));
+detail::Any getAgentVariableMaxFunc(HostAgentAPI &ai, const std::string &variable_name) {
+    return detail::Any(ai.max<T>(variable_name));
 }
 
 template<typename T>
-util::Any getAgentVariableStandardDevFunc(HostAgentAPI &ai, const std::string &variable_name) {
+detail::Any getAgentVariableStandardDevFunc(HostAgentAPI &ai, const std::string &variable_name) {
     // Todo, workout how to make this more multi-thread/deviceable.
     // Todo, streams for the memcpy?
     if (ai.count() == 0)
-        return util::Any(0.0);
+        return detail::Any(0.0);
     // Work out the Mean
     const double mean = ai.sum<T, typename sum_input_t<T>::result_t>(variable_name) / static_cast<double>(ai.count());
     // Then for each number: subtract the Mean and square the result
@@ -141,7 +141,7 @@ util::Any getAgentVariableStandardDevFunc(HostAgentAPI &ai, const std::string &v
     const double variance = ai.transformReduce<T, double>(variable_name, detail::standard_deviation_subtract_mean, detail::standard_deviation_add, 0) / static_cast<double>(ai.count());
     lock.unlock();
     // Take the square root of that and we are done!
-    return util::Any(sqrt(variance));
+    return detail::Any(sqrt(variance));
 }
 
 template<typename T>
@@ -182,4 +182,4 @@ void AgentLoggingConfig::logSum(const std::string &variable_name) {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_H_
diff --git a/include/flamegpu/sim/AgentLoggingConfig_Reductions.cuh b/include/flamegpu/simulation/AgentLoggingConfig_Reductions.cuh
similarity index 89%
rename from include/flamegpu/sim/AgentLoggingConfig_Reductions.cuh
rename to include/flamegpu/simulation/AgentLoggingConfig_Reductions.cuh
index c799e7335..a804e238c 100644
--- a/include/flamegpu/sim/AgentLoggingConfig_Reductions.cuh
+++ b/include/flamegpu/simulation/AgentLoggingConfig_Reductions.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
-#define INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
 
 namespace flamegpu {
 namespace detail {
@@ -55,4 +55,4 @@ __device__ __forceinline__ OutT standard_deviation_subtract_mean_impl::unary_fun
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_REDUCTIONS_CUH_
diff --git a/include/flamegpu/sim/AgentLoggingConfig_SumReturn.h b/include/flamegpu/simulation/AgentLoggingConfig_SumReturn.h
similarity index 88%
rename from include/flamegpu/sim/AgentLoggingConfig_SumReturn.h
rename to include/flamegpu/simulation/AgentLoggingConfig_SumReturn.h
index 32d43af59..a605cc52b 100644
--- a/include/flamegpu/sim/AgentLoggingConfig_SumReturn.h
+++ b/include/flamegpu/simulation/AgentLoggingConfig_SumReturn.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_SUMRETURN_H_
-#define INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_SUMRETURN_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_SUMRETURN_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_SUMRETURN_H_
 
 #include <cstdint>
 
@@ -63,4 +63,4 @@ template <typename T> struct sum_input_t { typedef T result_t; };
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_AGENTLOGGINGCONFIG_SUMRETURN_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_AGENTLOGGINGCONFIG_SUMRETURN_H_
diff --git a/include/flamegpu/pop/AgentVector.h b/include/flamegpu/simulation/AgentVector.h
similarity index 98%
rename from include/flamegpu/pop/AgentVector.h
rename to include/flamegpu/simulation/AgentVector.h
index 494dd1682..e840c944a 100644
--- a/include/flamegpu/pop/AgentVector.h
+++ b/include/flamegpu/simulation/AgentVector.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_POP_AGENTVECTOR_H_
-#define INCLUDE_FLAMEGPU_POP_AGENTVECTOR_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_H_
 
 #include <string>
 #include <utility>
@@ -7,11 +7,13 @@
 #include <map>
 
 #include "flamegpu/defines.h"
-#include "flamegpu/pop/detail/MemoryVector.h"
+#include "flamegpu/simulation/detail/MemoryVector.h"
 #include "flamegpu/model/AgentData.h"
 
 namespace flamegpu {
-
+namespace detail {
+class CUDAAgentStateList;
+}  // namespace detail
 class AgentInstance;
 class AgentDescription;
 class AgentVector_CAgent;
@@ -33,7 +35,7 @@ class AgentVector {
      * CUDAAgentStateList::getAgentData(AgentVector&) uses private AgentVector::resize(size_type, bool)
      * Can't include CUDAAgentStateList to friend the specific method.
      */
-    friend class CUDAAgentStateList;
+    friend class detail::CUDAAgentStateList;
     friend class AgentVector_CAgent;
     friend class AgentVector_Agent;
 
@@ -621,7 +623,7 @@ class AgentVector {
 }  // namespace flamegpu
 
 // @todo - why is this include part way down?
-#include "flamegpu/pop/AgentVector_Agent.h"
+#include "flamegpu/simulation/AgentVector_Agent.h"
 
 namespace flamegpu {
 
@@ -787,4 +789,4 @@ void AgentVector::py_erase(size_type first, size_type last) {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_AGENTVECTOR_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_H_
diff --git a/include/flamegpu/pop/AgentVector_Agent.h b/include/flamegpu/simulation/AgentVector_Agent.h
similarity index 84%
rename from include/flamegpu/pop/AgentVector_Agent.h
rename to include/flamegpu/simulation/AgentVector_Agent.h
index 24a241577..df9dab60c 100644
--- a/include/flamegpu/pop/AgentVector_Agent.h
+++ b/include/flamegpu/simulation/AgentVector_Agent.h
@@ -1,9 +1,9 @@
-#ifndef INCLUDE_FLAMEGPU_POP_AGENTVECTOR_AGENT_H_
-#define INCLUDE_FLAMEGPU_POP_AGENTVECTOR_AGENT_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_AGENT_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_AGENT_H_
 
 /**
  * THIS CLASS SHOULD NOT BE INCLUDED DIRECTLY
- * Include flamegpu/pop/AgentVector.h instead
+ * Include flamegpu/simulation/AgentVector.h instead
  * Use AgentVector::CAgent instead of AgentVector_CAgent
  * Use AgentVector::Agent instead of AgentVector_Agent
  */
@@ -13,8 +13,8 @@
 #include <utility>
 #include <memory>
 
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/detail/type_decode.h"
 
 namespace flamegpu {
 
@@ -151,16 +151,16 @@ void AgentVector_Agent::setVariable(const std::string &variable_name, const T va
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff->getElements() != type_decode<T>::len_t) {
+    if (v_buff->getElements() != detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' is an array variable, use the array method instead, "
             "in AgentVector_Agent::setVariable().",
             variable_name.c_str());
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::setVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
     // do the replace
@@ -186,21 +186,21 @@ void AgentVector_Agent::setVariable(const std::string &variable_name, const std:
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff->getElements() % type_decode<T>::len_t != 0) {
+    if (v_buff->getElements() % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not divisible by vector type length (%u) for variable '%s',  "
             "in AgentVector_Agent::getVariable().",
-            v_buff->getElements(), type_decode<T>::len_t, variable_name.c_str());
+            v_buff->getElements(), detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    if (v_buff->getElements() != N * type_decode<T>::len_t) {
+    if (v_buff->getElements() != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentVector_Agent::setVariable().",
             variable_name.c_str(), v_buff->getElements(), N);
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::setVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
     memcpy(static_cast<T*>(v_buff->getDataPtr()) + (index * N), value.data(), sizeof(T) * N);
@@ -230,25 +230,25 @@ void AgentVector_Agent::setVariable(const std::string &variable_name, const unsi
             "in AgentVector_Agent::setVariable()\n",
             variable_name.c_str(), N, v_buff->getElements());
     }
-    if (v_buff->getElements() % type_decode<T>::len_t != 0) {
+    if (v_buff->getElements() % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not divisible by vector type length (%u) for variable '%s',  "
             "in AgentVector_Agent::setVariable().",
-            v_buff->getElements(), type_decode<T>::len_t, variable_name.c_str());
+            v_buff->getElements(), detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::setVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
-    const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;
     if (t_index > v_buff->getElements() || t_index < array_index) {
         THROW exception::OutOfBoundsException("Index '%u' exceeds array bounds [0-%u) of variable '%s',  "
             "in AgentVector_Agent::setVariable().",
-            array_index, v_buff->getElements() / type_decode<T>::len_t, variable_name.c_str());
+            array_index, v_buff->getElements() / detail::type_decode<T>::len_t, variable_name.c_str());
     }
     _parent->_require(variable_name);
-    static_cast<T*>(v_buff->getDataPtr())[(index * (v_buff->getElements() / type_decode<T>::len_t)) + array_index] = value;
+    static_cast<T*>(v_buff->getDataPtr())[(index * (v_buff->getElements() / detail::type_decode<T>::len_t)) + array_index] = value;
 }
 #ifdef SWIG
 template <typename T>
@@ -269,16 +269,16 @@ void AgentVector_Agent::setVariableArray(const std::string &variable_name, const
             variable_name.c_str());
     }
     auto& v_buff = v_it->second;
-    if (v_buff->getElements() != value.size() * type_decode<T>::len_t) {
+    if (v_buff->getElements() != value.size() * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentVector_Agent::setVariableArray().",
-            variable_name.c_str(), v_buff->getElements(), value.size() * type_decode<T>::len_t);
+            variable_name.c_str(), v_buff->getElements(), value.size() * detail::type_decode<T>::len_t);
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::setVariableArray().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
     memcpy(static_cast<T*>(v_buff->getDataPtr()) + (index * v_buff->getElements()), value.data(), sizeof(T) * v_buff->getElements());
@@ -301,16 +301,16 @@ T AgentVector_CAgent::getVariable(const std::string &variable_name) const {
             variable_name.c_str());
     }
     const auto &v_buff = v_it->second;
-    if (v_buff->getElements() != type_decode<T>::len_t) {
+    if (v_buff->getElements() != detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' is an array variable, use the array method instead, "
             "in AgentVector_Agent::getVariable().",
             variable_name.c_str());
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::getVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
     return static_cast<const T*>(v_buff->getReadOnlyDataPtr())[index];
@@ -329,21 +329,21 @@ std::array<T, N> AgentVector_CAgent::getVariable(const std::string &variable_nam
             variable_name.c_str());
     }
     const auto& v_buff = v_it->second;
-    if (v_buff->getElements() % type_decode<T>::len_t != 0) {
+    if (v_buff->getElements() % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not divisible by vector type length (%u) for variable '%s',  "
             "in AgentVector_Agent::getVariable().",
-            v_buff->getElements(), type_decode<T>::len_t, variable_name.c_str());
+            v_buff->getElements(), detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    if (v_buff->getElements() != N * type_decode<T>::len_t) {
+    if (v_buff->getElements() != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidVarType("Variable '%s' has '%u' elements, but an array of length %u was passed, "
             "in AgentVector_Agent::getVariable().",
-            variable_name.c_str(), v_buff->getElements() / type_decode<T>::len_t, N);
+            variable_name.c_str(), v_buff->getElements() / detail::type_decode<T>::len_t, N);
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::getVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
     std::array<T, N> rtn;
@@ -369,25 +369,25 @@ T AgentVector_CAgent::getVariable(const std::string &variable_name, const unsign
             "in AgentVector_Agent::getVariable()\n",
             variable_name.c_str(), N, v_buff->getElements());
     }
-    if (v_buff->getElements() % type_decode<T>::len_t != 0) {
+    if (v_buff->getElements() % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidVarType("Variable array length (%u) is not divisible by vector type length (%u) for variable '%s',  "
             "in AgentVector_Agent::getVariable().",
-            v_buff->getElements(), type_decode<T>::len_t,  variable_name.c_str());
+            v_buff->getElements(), detail::type_decode<T>::len_t,  variable_name.c_str());
     }
-    const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;
     if (t_index > v_buff->getElements() || t_index < array_index) {
         THROW exception::OutOfBoundsException("Index '%u' exceeds array bounds [0-%u) of variable '%s',  "
             "in AgentVector_Agent::getVariable().",
-            array_index, v_buff->getElements() / type_decode<T>::len_t, variable_name.c_str());
+            array_index, v_buff->getElements() / detail::type_decode<T>::len_t, variable_name.c_str());
     }
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::getVariable().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
-    return static_cast<const T*>(v_buff->getReadOnlyDataPtr())[(index * (v_buff->getElements() / type_decode<T>::len_t)) + array_index];
+    return static_cast<const T*>(v_buff->getReadOnlyDataPtr())[(index * (v_buff->getElements() / detail::type_decode<T>::len_t)) + array_index];
 }
 #ifdef SWIG
 template <typename T>
@@ -404,19 +404,19 @@ std::vector<T> AgentVector_CAgent::getVariableArray(const std::string& variable_
             variable_name.c_str());
     }
     const auto& v_buff = v_it->second;
-    if (v_buff->getType() != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (v_buff->getType() != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidVarType("Variable '%s' is of a different type. "
             "'%s' was expected, but '%s' was requested,"
             "in AgentVector_Agent::getVariableArray().",
-            variable_name.c_str(), v_buff->getType().name(), typeid(typename type_decode<T>::type_t).name());
+            variable_name.c_str(), v_buff->getType().name(), typeid(typename detail::type_decode<T>::type_t).name());
     }
     _parent->_require(variable_name);
-    std::vector<T> rtn(static_cast<size_t>(v_buff->getElements() / type_decode<T>::len_t));
-    memcpy(rtn.data(), static_cast<T*>(v_buff->getDataPtr()) + (index * (v_buff->getElements() / type_decode<T>::len_t)), sizeof(typename type_decode<T>::type_t) * v_buff->getElements());
+    std::vector<T> rtn(static_cast<size_t>(v_buff->getElements() / detail::type_decode<T>::len_t));
+    memcpy(rtn.data(), static_cast<T*>(v_buff->getDataPtr()) + (index * (v_buff->getElements() / detail::type_decode<T>::len_t)), sizeof(typename detail::type_decode<T>::type_t) * v_buff->getElements());
     return rtn;
 }
 #endif  // IFDEF SWIG
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_AGENTVECTOR_AGENT_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_AGENTVECTOR_AGENT_H_
diff --git a/include/flamegpu/gpu/CUDAEnsemble.h b/include/flamegpu/simulation/CUDAEnsemble.h
similarity index 97%
rename from include/flamegpu/gpu/CUDAEnsemble.h
rename to include/flamegpu/simulation/CUDAEnsemble.h
index dfa360fda..2cf473869 100644
--- a/include/flamegpu/gpu/CUDAEnsemble.h
+++ b/include/flamegpu/simulation/CUDAEnsemble.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAENSEMBLE_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAENSEMBLE_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_CUDAENSEMBLE_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_CUDAENSEMBLE_H_
 
 #include <string>
 #include <memory>
@@ -184,4 +184,4 @@ class CUDAEnsemble {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAENSEMBLE_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_CUDAENSEMBLE_H_
diff --git a/include/flamegpu/gpu/CUDASimulation.h b/include/flamegpu/simulation/CUDASimulation.h
similarity index 94%
rename from include/flamegpu/gpu/CUDASimulation.h
rename to include/flamegpu/simulation/CUDASimulation.h
index 1dd0439d3..b7fdf8757 100644
--- a/include/flamegpu/gpu/CUDASimulation.h
+++ b/include/flamegpu/simulation/CUDASimulation.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDASIMULATION_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDASIMULATION_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_CUDASIMULATION_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_CUDASIMULATION_H_
 #include <atomic>
 #include <memory>
 #include <vector>
@@ -9,14 +9,14 @@
 #include <set>
 
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
-#include "flamegpu/sim/Simulation.h"
+#include "flamegpu/simulation/Simulation.h"
 #include "flamegpu/runtime/detail/curve/HostCurve.cuh"
-#include "flamegpu/gpu/CUDAScatter.cuh"
-#include "flamegpu/gpu/CUDAEnsemble.h"
-#include "flamegpu/runtime/utility/RandomManager.cuh"
-#include "flamegpu/runtime/HostNewAgentAPI.h"
-#include "flamegpu/gpu/CUDAMacroEnvironment.h"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
+#include "flamegpu/simulation/CUDAEnsemble.h"
+#include "flamegpu/simulation/detail/RandomManager.cuh"
+#include "flamegpu/runtime/agent/HostNewAgentAPI.h"
+#include "flamegpu/simulation/detail/CUDAMacroEnvironment.h"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
 
 #ifdef FLAMEGPU_VISUALISATION
 #include "flamegpu/visualiser/ModelVis.h"
@@ -31,10 +31,13 @@
 #endif
 
 namespace flamegpu {
-
-class AgentVector;
+namespace detail {
+class SimRunner;
 class CUDAAgent;
 class CUDAMessage;
+}  // namespace detail
+
+class AgentVector;
 class LoggingConfig;
 class StepLoggingConfig;
 class RunPlan;
@@ -50,7 +53,7 @@ class CUDASimulation : public Simulation {
      * Requires internal access to scan/scatter singletons
      */
     friend class HostAgentAPI;
-    friend class SimRunner;
+    friend class detail::SimRunner;
     friend class CUDAEnsemble;
 #ifdef FLAMEGPU_VISUALISATION
     friend struct visualiser::ModelVisData;
@@ -59,12 +62,12 @@ class CUDASimulation : public Simulation {
      * Map of a number of CUDA agents by name.
      * The CUDA agents are responsible for allocating and managing all the device memory
      */
-    typedef std::unordered_map<std::string, std::unique_ptr<CUDAAgent>> CUDAAgentMap;
+    typedef std::unordered_map<std::string, std::unique_ptr<detail::CUDAAgent>> CUDAAgentMap;
     /**
      * Map of a number of CUDA messages by name.
      * The CUDA messages are responsible for allocating and managing all the device memory
      */
-    typedef std::unordered_map<std::string, std::unique_ptr<CUDAMessage>> CUDAMessageMap;
+    typedef std::unordered_map<std::string, std::unique_ptr<detail::CUDAMessage>> CUDAMessageMap;
     /**
      * Map of a number of CUDA sub models by name.
      * The CUDA submodels are responsible for allocating and managing all the device memory of non mapped agent vars
@@ -77,7 +80,7 @@ class CUDASimulation : public Simulation {
      * CUDA runner specific config
      */
     struct Config {
-        friend class SimRunner;
+        friend class detail::SimRunner;
         friend class CUDASimulation;
         /**
          * GPU to execute model on
@@ -251,13 +254,13 @@ class CUDASimulation : public Simulation {
      * Returns the manager for the specified agent
      * @todo remove? this is mostly internal methods that modeller doesn't need access to
      */
-    CUDAAgent& getCUDAAgent(const std::string &agent_name) const;
-    AgentInterface &getAgent(const std::string &name) override;
+    detail::CUDAAgent& getCUDAAgent(const std::string &agent_name) const;
+    detail::AgentInterface &getAgent(const std::string &name) override;
     /**
      * Returns the manager for the specified agent
      * @todo remove? this is mostly internal methods that modeller doesn't need access to
      */
-    CUDAMessage& getCUDAMessage(const std::string &message_name) const;
+    detail::CUDAMessage& getCUDAMessage(const std::string &message_name) const;
     /**
      * @return A mutable reference to the cuda model specific configuration struct
      * @see Simulation::applyConfig() Should be called afterwards to apply changes
@@ -410,7 +413,7 @@ class CUDASimulation : public Simulation {
     /**
      * Macro env property storage
      */
-    CUDAMacroEnvironment macro_env;
+    detail::CUDAMacroEnvironment macro_env;
     /**
      * Internal model config
      */
@@ -517,22 +520,22 @@ class CUDASimulation : public Simulation {
         /**
          * Resizes device random array during step()
          */
-        RandomManager rng;
+        detail::RandomManager rng;
         /**
          * Held here for tracking when to release cuda memory
          */
-        CUDAScatter scatter;
+        detail::CUDAScatter scatter;
         /**
          * Held here for tracking when to release cuda memory
          */
-        std::shared_ptr<EnvironmentManager> environment;
+        std::shared_ptr<detail::EnvironmentManager> environment;
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
         /**
          * Provides buffers for device error checking
          */
         exception::DeviceExceptionManager exception;
 #endif
-        explicit Singletons(const std::shared_ptr<EnvironmentManager> &environment) : environment(environment) { }
+        explicit Singletons(const std::shared_ptr<detail::EnvironmentManager> &environment) : environment(environment) { }
     } * singletons;
     /**
      * Common method for adding this Model's data to env manager
@@ -581,7 +584,7 @@ class CUDASimulation : public Simulation {
     typedef std::unordered_map<std::string, AgentDataBufferStateMap> AgentDataMap;
 
  private:
-    std::shared_ptr<EnvironmentManager> getEnvironment() const override;
+    std::shared_ptr<detail::EnvironmentManager> getEnvironment() const override;
     void assignAgentIDs();
     /**
      * Set to false whenever an agent population is imported from outside
@@ -682,4 +685,4 @@ std::vector<T> CUDASimulation::getEnvironmentPropertyArray(const std::string& pr
 #endif
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDASIMULATION_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_CUDASIMULATION_H_
diff --git a/include/flamegpu/sim/LogFrame.h b/include/flamegpu/simulation/LogFrame.h
similarity index 94%
rename from include/flamegpu/sim/LogFrame.h
rename to include/flamegpu/simulation/LogFrame.h
index 2e22d7924..64582eb60 100644
--- a/include/flamegpu/sim/LogFrame.h
+++ b/include/flamegpu/simulation/LogFrame.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_LOGFRAME_H_
-#define INCLUDE_FLAMEGPU_SIM_LOGFRAME_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_LOGFRAME_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_LOGFRAME_H_
 
 #include "AgentLoggingConfig.h"
 
@@ -9,8 +9,8 @@
 #include <utility>
 #include <vector>
 
-#include "flamegpu/sim/LoggingConfig.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/simulation/LoggingConfig.h"
+#include "flamegpu/detail/Any.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
 
 namespace flamegpu {
@@ -32,8 +32,8 @@ struct LogFrame {
     /**
      * Creates a log with pre-populated data
      */
-    LogFrame(const std::map<std::string, util::Any> &_environment,
-    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> &_agents,
+    LogFrame(const std::map<std::string, detail::Any> &_environment,
+    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> &_agents,
     unsigned int _step_count);
     /**
      * Returns the step count of the log
@@ -65,15 +65,15 @@ struct LogFrame {
     /**
      * Raw access to environment log map
      */
-    const std::map<std::string, util::Any> &getEnvironment() const { return environment; }
+    const std::map<std::string, detail::Any> &getEnvironment() const { return environment; }
     /**
      * Raw access to agent log map
      */
-    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> &getAgents() const { return agents; }
+    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> &getAgents() const { return agents; }
 
  private:
-    std::map<std::string, util::Any> environment;
-    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> agents;
+    std::map<std::string, detail::Any> environment;
+    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> agents;
     unsigned int step_count;
 };
 
@@ -90,8 +90,8 @@ struct StepLogFrame : public LogFrame {
     /**
      * Creates a log with pre-populated data
      */
-    StepLogFrame(const std::map<std::string, util::Any>&& _environment,
-        const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>>&& _agents,
+    StepLogFrame(const std::map<std::string, detail::Any>&& _environment,
+        const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>>&& _agents,
         unsigned int _step_count);
 
     /**
@@ -121,8 +121,8 @@ struct ExitLogFrame : public LogFrame {
     /**
      * Creates a log with pre-populated data
      */
-    ExitLogFrame(const std::map<std::string, util::Any>&& _environment,
-        const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>>&& _agents,
+    ExitLogFrame(const std::map<std::string, detail::Any>&& _environment,
+        const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>>&& _agents,
         unsigned int _step_count);
 
     /**
@@ -267,7 +267,7 @@ struct AgentLogFrame {
      * @param data Map of reduction data
      * @param count Population size (alive agents)
      */
-    explicit AgentLogFrame(const std::map<LoggingConfig::NameReductionFn, util::Any> &data, unsigned int count);
+    explicit AgentLogFrame(const std::map<LoggingConfig::NameReductionFn, detail::Any> &data, unsigned int count);
     /**
      * Return the number of alive agents in the population
      * @return The population size
@@ -331,7 +331,7 @@ struct AgentLogFrame {
     /**
      * Logging data
      */
-    const std::map<LoggingConfig::NameReductionFn, util::Any> &data;
+    const std::map<LoggingConfig::NameReductionFn, detail::Any> &data;
     /**
      * Population size of the related agent state
      */
@@ -449,4 +449,4 @@ typename sum_input_t<T>::result_t AgentLogFrame::getSum(const std::string &varia
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_LOGFRAME_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_LOGFRAME_H_
diff --git a/include/flamegpu/sim/LoggingConfig.h b/include/flamegpu/simulation/LoggingConfig.h
similarity index 94%
rename from include/flamegpu/sim/LoggingConfig.h
rename to include/flamegpu/simulation/LoggingConfig.h
index 6f31d60d8..81a7d92cf 100644
--- a/include/flamegpu/sim/LoggingConfig.h
+++ b/include/flamegpu/simulation/LoggingConfig.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_LOGGINGCONFIG_H_
-#define INCLUDE_FLAMEGPU_SIM_LOGGINGCONFIG_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_LOGGINGCONFIG_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_LOGGINGCONFIG_H_
 
 #include <string>
 #include <map>
@@ -8,9 +8,9 @@
 #include <memory>
 
 #include "flamegpu/util/StringPair.h"
-#include "flamegpu/runtime/HostAgentAPI.cuh"
+#include "flamegpu/runtime/agent/HostAgentAPI.cuh"
 #include "flamegpu/model/ModelData.h"
-#include "flamegpu/gpu/CUDAEnsemble.h"
+#include "flamegpu/simulation/CUDAEnsemble.h"
 
 namespace flamegpu {
 
@@ -63,7 +63,7 @@ class LoggingConfig {
      * Typedef'ing function prototypes like this allows for cleaner function pointers
      * @note - this leads to a swig warning 504 which is suppressed.
      */
-    typedef util::Any (ReductionFn)(HostAgentAPI &ai, const std::string &variable_name);
+    typedef detail::Any (ReductionFn)(HostAgentAPI &ai, const std::string &variable_name);
     /**
      * A user configured reduction to be logged
      */
@@ -193,4 +193,4 @@ class StepLoggingConfig : public LoggingConfig {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_LOGGINGCONFIG_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_LOGGINGCONFIG_H_
diff --git a/include/flamegpu/sim/RunPlan.h b/include/flamegpu/simulation/RunPlan.h
similarity index 87%
rename from include/flamegpu/sim/RunPlan.h
rename to include/flamegpu/simulation/RunPlan.h
index 9c83a1234..e9d860fa0 100644
--- a/include/flamegpu/sim/RunPlan.h
+++ b/include/flamegpu/simulation/RunPlan.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_RUNPLAN_H_
-#define INCLUDE_FLAMEGPU_SIM_RUNPLAN_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_RUNPLAN_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_RUNPLAN_H_
 
 #include <unordered_map>
 #include <string>
@@ -8,13 +8,15 @@
 #include <memory>
 
 #include "flamegpu/model/EnvironmentDescription.h"
-#include "flamegpu/util/Any.h"
-#include "flamegpu/util/type_decode.h"
-#include "flamegpu/gpu/CUDAEnsemble.h"
+#include "flamegpu/detail/Any.h"
+#include "flamegpu/detail/type_decode.h"
+#include "flamegpu/simulation/CUDAEnsemble.h"
 
 
 namespace flamegpu {
-
+namespace detail {
+class SimRunner;
+}
 class ModelDescription;
 class RunPlanVector;
 class CUDASimulation;
@@ -29,7 +31,7 @@ class XMLLogger;
  */
 class RunPlan {
     friend class RunPlanVector;
-    friend class SimRunner;
+    friend class detail::SimRunner;
     friend class CUDASimulation;
     friend class io::JSONLogger;
     friend class io::XMLLogger;
@@ -183,7 +185,7 @@ class RunPlan {
     uint64_t random_seed;
     unsigned int steps;
     std::string output_subdirectory;
-    std::unordered_map<std::string, util::Any> property_overrides;
+    std::unordered_map<std::string, detail::Any> property_overrides;
     /**
      * Reference to model environment data, for validation
      */
@@ -204,19 +206,19 @@ void RunPlan::setProperty(const std::string &name, T value) {
             "in RunPlan::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != type_decode<T>::len_t) {
+    if (it->second.data.elements != detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' is an array with %u elements, array method should be used, "
             "in RunPlan::setProperty()\n",
             name.c_str(), it->second.data.elements);
     }
     // Store property
     property_overrides.erase(name);
-    property_overrides.emplace(name, util::Any(&value, sizeof(T), typeid(typename type_decode<T>::type_t), type_decode<T>::len_t));
+    property_overrides.emplace(name, detail::Any(&value, sizeof(T), typeid(typename detail::type_decode<T>::type_t), detail::type_decode<T>::len_t));
 }
 template<typename T, flamegpu::size_type N>
 void RunPlan::setProperty(const std::string &name, const std::array<T, N> &value) {
@@ -227,19 +229,19 @@ void RunPlan::setProperty(const std::string &name, const std::array<T, N> &value
             "in RunPlan::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != N * type_decode<T>::len_t) {
+    if (it->second.data.elements != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property array '%s' length mismatch %u != %u "
             "in RunPlan::setProperty()\n",
-            name.c_str(), it->second.data.elements, N * type_decode<T>::len_t);
+            name.c_str(), it->second.data.elements, N * detail::type_decode<T>::len_t);
     }
     // Store property
     property_overrides.erase(name);
-    property_overrides.emplace(name, util::Any(value.data(), sizeof(T) * N, typeid(typename type_decode<T>::type_t), type_decode<T>::len_t * N));
+    property_overrides.emplace(name, detail::Any(value.data(), sizeof(T) * N, typeid(typename detail::type_decode<T>::type_t), detail::type_decode<T>::len_t * N));
 }
 template<typename T, flamegpu::size_type N>
 void RunPlan::setProperty(const std::string &name, const flamegpu::size_type index, T value) {
@@ -250,17 +252,17 @@ void RunPlan::setProperty(const std::string &name, const flamegpu::size_type ind
             "in RunPlan::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
     if (N && N != it->second.data.elements) {
         THROW exception::OutOfBoundsException("Environment property '%s' length mismatch '%u' != '%u', "
             "in RunPlan::setProperty()\n",
             name.c_str(), N, it->second.data.elements);
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (it->second.data.elements < t_index || t_index < index) {
         throw exception::OutOfBoundsException("Environment property array index out of bounds "
             "in RunPlan::setProperty()\n");
@@ -284,19 +286,19 @@ void RunPlan::setPropertyArray(const std::string &name, const std::vector<T> &va
             "in RunPlan::setPropertyArray()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::setPropertyArray()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (type_decode<T>::len_t * value.size() != it->second.data.elements) {
+    if (detail::type_decode<T>::len_t * value.size() != it->second.data.elements) {
         THROW exception::InvalidEnvPropertyType("Environment property array length does not match the value provided, %u != %llu,"
             "in RunPlan::setPropertyArray()\n",
-            name.c_str(), type_decode<T>::len_t * value.size(), it->second.data.elements);
+            it->second.data.elements, static_cast<unsigned long long>(detail::type_decode<T>::len_t * value.size()));
     }
     // Store property
     property_overrides.erase(name);
-    property_overrides.emplace(name, util::Any(value.data(), sizeof(T) * value.size(), typeid(typename type_decode<T>::type_t), type_decode<T>::len_t * value.size()));
+    property_overrides.emplace(name, detail::Any(value.data(), sizeof(T) * value.size(), typeid(typename detail::type_decode<T>::type_t), detail::type_decode<T>::len_t * value.size()));
 }
 #endif
 
@@ -309,12 +311,12 @@ T RunPlan::getProperty(const std::string &name) const {
             "in RunPlan::getProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::getProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != type_decode<T>::len_t) {
+    if (it->second.data.elements != detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' is an array with %u elements, array method should be used, "
             "in RunPlan::getProperty()\n",
             name.c_str(), it->second.data.elements);
@@ -338,12 +340,12 @@ std::array<T, N> RunPlan::getProperty(const std::string &name) const {
             "in RunPlan::getProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::getProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != N * type_decode<T>::len_t) {
+    if (it->second.data.elements != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property array '%s' length mismatch %u != %u "
             "in RunPlan::getProperty()\n",
             name.c_str(), it->second.data.elements, N);
@@ -369,17 +371,17 @@ T RunPlan::getProperty(const std::string &name, const flamegpu::size_type index)
             "in RunPlan::getProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::getProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
     if (N && N != it->second.data.elements) {
         THROW exception::OutOfBoundsException("Environment property '%s' length mismatch '%u' != '%u', "
             "in RunPlan::getProperty()\n",
             name.c_str(), N, it->second.data.elements);
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (it->second.data.elements < t_index || t_index < index) {
         throw exception::OutOfBoundsException("Environment property array index out of bounds "
             "in RunPlan::getProperty()\n");
@@ -410,19 +412,19 @@ std::vector<T> RunPlan::getPropertyArray(const std::string &name) {
             "in RunPlan::getProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlan::getProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements % type_decode<T>::len_t != 0) {
+    if (it->second.data.elements % detail::type_decode<T>::len_t != 0) {
         THROW exception::InvalidEnvPropertyType("Environmental property array '%s' length (%u) is not a multiple of vector length (%u), "
             "in RunPlan::getPropertyArray().",
-             name.c_str(), type_decode<T>::len_t, it->second.data.elements, type_decode<T>::len_t);
+             name.c_str(), it->second.data.elements, detail::type_decode<T>::len_t);
     }
     // Check whether array already exists in property overrides
     const auto it2 = property_overrides.find(name);
-    std::vector<T> rtn(it->second.data.elements / type_decode<T>::len_t);
+    std::vector<T> rtn(it->second.data.elements / detail::type_decode<T>::len_t);
     if (it2 != property_overrides.end()) {
         // The property has been overridden, return the override
         memcpy(rtn.data(), it2->second.ptr, it2->second.length);
@@ -436,4 +438,4 @@ std::vector<T> RunPlan::getPropertyArray(const std::string &name) {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_RUNPLAN_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_RUNPLAN_H_
diff --git a/include/flamegpu/sim/RunPlanVector.h b/include/flamegpu/simulation/RunPlanVector.h
similarity index 93%
rename from include/flamegpu/sim/RunPlanVector.h
rename to include/flamegpu/simulation/RunPlanVector.h
index c9d95a287..5de85d547 100644
--- a/include/flamegpu/sim/RunPlanVector.h
+++ b/include/flamegpu/simulation/RunPlanVector.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_RUNPLANVECTOR_H_
-#define INCLUDE_FLAMEGPU_SIM_RUNPLANVECTOR_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_RUNPLANVECTOR_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_RUNPLANVECTOR_H_
 
 #include <random>
 #include <vector>
@@ -8,9 +8,9 @@
 #include <memory>
 #include <limits>
 
-#include "flamegpu/sim/RunPlan.h"
-#include "flamegpu/util/detail/StaticAssert.h"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/simulation/RunPlan.h"
+#include "flamegpu/detail/StaticAssert.h"
+#include "flamegpu/detail/type_decode.h"
 #include "flamegpu/model/EnvironmentData.h"
 
 
@@ -25,7 +25,7 @@ class EnvironmentDescription;
  */
 class RunPlanVector : private std::vector<RunPlan>  {
     friend class RunPlan;
-    friend class SimRunner;
+    friend class detail::SimRunner;
     friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);
 
  public:
@@ -318,12 +318,12 @@ void RunPlanVector::setProperty(const std::string &name, const T value) {
             "in RunPlanVector::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlanVector::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != type_decode<T>::len_t) {
+    if (it->second.data.elements != detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' is an array with %u elements, array method should be used, "
             "in RunPlanVector::setProperty()\n",
             name.c_str(), it->second.data.elements);
@@ -341,15 +341,15 @@ void RunPlanVector::setProperty(const std::string &name, const std::array<T, N>
             "in RunPlanVector::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlanVector::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (it->second.data.elements != N * type_decode<T>::len_t) {
+    if (it->second.data.elements != N * detail::type_decode<T>::len_t) {
         THROW exception::InvalidEnvPropertyType("Environment property array '%s' length mismatch %u != %u "
             "in RunPlanVector::setProperty()\n",
-            name.c_str(), it->second.data.elements, N * type_decode<T>::len_t);
+            name.c_str(), it->second.data.elements, N * detail::type_decode<T>::len_t);
     }
     for (auto &i : *this) {
         i.setProperty<T, N>(name, value);
@@ -364,12 +364,12 @@ void RunPlanVector::setProperty(const std::string &name, const flamegpu::size_ty
             "in RunPlanVector::setProperty()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlanVector::setProperty()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (t_index > it->second.data.elements || t_index < index) {
         throw exception::OutOfBoundsException("Environment property array index out of bounds "
             "in RunPlanVector::setProperty()\n");
@@ -388,15 +388,15 @@ void RunPlanVector::setPropertyArray(const std::string &name, const std::vector<
             "in RunPlanVector::setPropertyArray()\n",
             name.c_str());
     }
-    if (it->second.data.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+    if (it->second.data.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
         THROW exception::InvalidEnvPropertyType("Environment property '%s' type mismatch '%s' != '%s', "
             "in RunPlanVector::setPropertyArray()\n",
-            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename type_decode<T>::type_t)).name());
+            name.c_str(), it->second.data.type.name(), std::type_index(typeid(typename detail::type_decode<T>::type_t)).name());
     }
-    if (value.size() * type_decode<T>::len_t != it->second.data.elements) {
+    if (value.size() * detail::type_decode<T>::len_t != it->second.data.elements) {
         THROW exception::InvalidEnvProperty("Environment property array length does not match the value provided, %u != %llu,"
             "in RunPlanVector::setPropertyArray()\n",
-            name.c_str(), value.size() * type_decode<T>::len_t, it->second.data.elements);
+            it->second.data.elements, static_cast<unsigned long long>(value.size() * detail::type_decode<T>::len_t));
     }
     for (auto &i : *this) {
         i.setPropertyArray<T>(name, value);
@@ -455,7 +455,7 @@ void RunPlanVector::setPropertyLerpRange(const std::string &name, const flamegpu
             "in RunPlanVector::setPropertyLerpRange()\n",
             name.c_str(), it->second.data.type.name(), std::type_index(typeid(T)).name());
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (t_index > it->second.data.elements || t_index < index) {
         throw exception::OutOfBoundsException("Environment property array index out of bounds "
             "in RunPlanVector::setPropertyLerpRange()\n");
@@ -516,7 +516,7 @@ void RunPlanVector::setPropertyRandom(const std::string &name, const flamegpu::s
             "in RunPlanVector::setPropertyRandom()\n",
             name.c_str(), it->second.data.type.name(), std::type_index(typeid(T)).name());
     }
-    const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;
+    const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;
     if (t_index > it->second.data.elements || t_index < index) {
         throw exception::OutOfBoundsException("Environment property array index out of bounds "
             "in RunPlanVector::setPropertyRandom()\n");
@@ -530,39 +530,39 @@ void RunPlanVector::setPropertyRandom(const std::string &name, const flamegpu::s
  */
 template<typename T>
 void RunPlanVector::setPropertyUniformRandom(const std::string &name, const T min, const T max) {
-    static_assert(util::detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for RunPlanVector::setPropertyUniformRandom(const std::string &name, T min, T max)");
+    static_assert(detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for RunPlanVector::setPropertyUniformRandom(const std::string &name, T min, T max)");
     std::uniform_int_distribution<T> dist(min, max);
     setPropertyRandom<T>(name, dist);
 }
 template<typename T>
 void RunPlanVector::setPropertyUniformRandom(const std::string &name, const flamegpu::size_type index, const T min, const T max) {
-    static_assert(util::detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for RunPlanVector::setPropertyUniformRandom(const std::string &name, flamegpu::size_type index, T min, T max)");
+    static_assert(detail::StaticAssert::_Is_IntType<T>::value, "Invalid template argument for RunPlanVector::setPropertyUniformRandom(const std::string &name, flamegpu::size_type index, T min, T max)");
     std::uniform_int_distribution<T> dist(min, max);
     setPropertyRandom<T>(name, index, dist);
 }
 template<typename T>
 void RunPlanVector::setPropertyNormalRandom(const std::string &name, const T mean, const T stddev) {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for RunPlanVector::setPropertyNormalRandom(const std::string &name, T mean, T stddev)");
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value, "Invalid template argument for RunPlanVector::setPropertyNormalRandom(const std::string &name, T mean, T stddev)");
     std::normal_distribution<T> dist(mean, stddev);
     setPropertyRandom<T>(name, dist);
 }
 template<typename T>
 void RunPlanVector::setPropertyNormalRandom(const std::string &name, const flamegpu::size_type index, const T mean, const T stddev) {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value,
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value,
         "Invalid template argument for RunPlanVector::setPropertyNormalRandom(const std::string &name, flamegpu::size_type index, T mean, T stddev)");
     std::normal_distribution<T> dist(mean, stddev);
     setPropertyRandom<T>(name, index, dist);
 }
 template<typename T>
 void RunPlanVector::setPropertyLogNormalRandom(const std::string &name, const T mean, const T stddev) {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value,
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value,
     "Invalid template argument for RunPlanVector::setPropertyLogNormalRandom(const std::string &name, T mean, T stddev)");
     std::lognormal_distribution<T> dist(mean, stddev);
     setPropertyRandom<T>(name, dist);
 }
 template<typename T>
 void RunPlanVector::setPropertyLogNormalRandom(const std::string &name, const flamegpu::size_type index, const T mean, const T stddev) {
-    static_assert(util::detail::StaticAssert::_Is_RealType<T>::value,
+    static_assert(detail::StaticAssert::_Is_RealType<T>::value,
     "Invalid template argument for RunPlanVector::setPropertyLogNormalRandom(const std::string &name, flamegpu::size_type index, T mean, T stddev)");
     std::lognormal_distribution<T> dist(mean, stddev);
     setPropertyRandom<T>(name, index, dist);
@@ -625,4 +625,4 @@ inline void RunPlanVector::setPropertyUniformRandom(const std::string &name, con
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_RUNPLANVECTOR_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_RUNPLANVECTOR_H_
diff --git a/include/flamegpu/sim/Simulation.h b/include/flamegpu/simulation/Simulation.h
similarity index 93%
rename from include/flamegpu/sim/Simulation.h
rename to include/flamegpu/simulation/Simulation.h
index 2f0e456e0..cbdc069c2 100644
--- a/include/flamegpu/sim/Simulation.h
+++ b/include/flamegpu/simulation/Simulation.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_SIMULATION_H_
-#define INCLUDE_FLAMEGPU_SIM_SIMULATION_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_SIMULATION_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_SIMULATION_H_
 
 #include <memory>
 #include <string>
@@ -8,18 +8,19 @@
 #include <unordered_map>
 
 #include "flamegpu/defines.h"
-#include "flamegpu/sim/AgentInterface.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/simulation/detail/AgentInterface.h"
+#include "flamegpu/detail/Any.h"
 
 namespace flamegpu {
-
+namespace detail {
+class EnvironmentManager;
+}  // namespace detail
 class AgentVector;
 class HostAPI;
 class ModelDescription;
 struct ModelData;
 struct RunLog;
 class CUDASimulation;
-class EnvironmentManager;
 
 
 /**
@@ -122,7 +123,7 @@ class Simulation {
     virtual void getPopulationData(AgentVector& population, const std::string& state_name = ModelData::DEFAULT_STATE) = 0;
 
     virtual const RunLog &getRunLog() const = 0;
-    virtual AgentInterface &getAgent(const std::string &name) = 0;
+    virtual detail::AgentInterface &getAgent(const std::string &name) = 0;
 
     Config &SimulationConfig();
     const Config &getSimulationConfig() const;
@@ -149,7 +150,7 @@ class Simulation {
     /**
      * Returns the environment manager
      */
-    virtual std::shared_ptr<EnvironmentManager> getEnvironment() const = 0;
+    virtual std::shared_ptr<detail::EnvironmentManager> getEnvironment() const = 0;
 
     /**
      * returns the width of the widest layer in model.
@@ -181,7 +182,7 @@ class Simulation {
     /**
      * Initial environment items if they have been loaded from file, prior to device selection
      */
-    std::unordered_map<std::string, util::Any> env_init;
+    std::unordered_map<std::string, detail::Any> env_init;
     /**
      * the width of the widest layer in the concrete version of the model (calculated once)
      */
@@ -198,4 +199,4 @@ class Simulation {
 
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_SIMULATION_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_SIMULATION_H_
diff --git a/include/flamegpu/sim/AgentInterface.h b/include/flamegpu/simulation/detail/AgentInterface.h
similarity index 90%
rename from include/flamegpu/sim/AgentInterface.h
rename to include/flamegpu/simulation/detail/AgentInterface.h
index 1d26fe3c4..41b200d13 100644
--- a/include/flamegpu/sim/AgentInterface.h
+++ b/include/flamegpu/simulation/detail/AgentInterface.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_AGENTINTERFACE_H_
-#define INCLUDE_FLAMEGPU_SIM_AGENTINTERFACE_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_AGENTINTERFACE_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_AGENTINTERFACE_H_
 
 #include <string>
 #include <memory>
@@ -11,6 +11,7 @@
 namespace flamegpu {
 class DeviceAgentVector_impl;
 class CAgentDescription;
+namespace detail {
 /**
  * Base-class (interface) for classes like CUDAAgent, which provide access to agent data
  */
@@ -50,6 +51,7 @@ class AgentInterface {
     virtual void resetPopulationVecs() = 0;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_AGENTINTERFACE_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_AGENTINTERFACE_H_
diff --git a/include/flamegpu/gpu/CUDAAgent.h b/include/flamegpu/simulation/detail/CUDAAgent.h
similarity index 94%
rename from include/flamegpu/gpu/CUDAAgent.h
rename to include/flamegpu/simulation/detail/CUDAAgent.h
index 37e3dd358..ae802f0e9 100644
--- a/include/flamegpu/gpu/CUDAAgent.h
+++ b/include/flamegpu/simulation/detail/CUDAAgent.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAAGENT_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAAGENT_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENT_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENT_H_
 
 #include <memory>
 #include <map>
@@ -10,20 +10,21 @@
 #include <list>
 
 // include sub classes
-#include "flamegpu/util/detail/JitifyCache.h"
-#include "flamegpu/gpu/CUDAAgentStateList.h"
+#include "flamegpu/detail/JitifyCache.h"
+#include "flamegpu/simulation/detail/CUDAAgentStateList.h"
 #include "flamegpu/model/AgentFunctionData.cuh"
 #include "flamegpu/model/SubAgentData.h"
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
-#include "flamegpu/sim/AgentInterface.h"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
+#include "flamegpu/simulation/detail/AgentInterface.h"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
 
 namespace flamegpu {
+class HostAPI;
+struct VarOffsetStruct;
+namespace detail {
 class CUDAMacroEnvironment;
 class CUDAScatter;
 class CUDAFatAgent;
-struct VarOffsetStruct;
-class HostAPI;
 /**
  * This is the regular CUDAAgent
  * It provides access to the device buffers representing the states of a particular agent
@@ -82,7 +83,7 @@ class CUDAAgent : public AgentInterface {
      * @param stream CUDA stream to be used for async CUDA operations
      * @note Scatter is required for initialising submodel vars
      */
-    void setPopulationData(const AgentVector& population, const std::string &state_name, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void setPopulationData(const AgentVector& population, const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Copies population data the device buffers held by this object
      * To the hosts object (overwriting any existing agent data)
@@ -119,7 +120,7 @@ class CUDAAgent : public AgentInterface {
      * @param stream CUDA stream to be used for async CUDA operations
      * @see CUDAFatAgent::processDeath(unsigned int, const std::string &, unsigned int)
      */
-    void processDeath(const AgentFunctionData& func, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void processDeath(const AgentFunctionData& func, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Transitions all active agents from the source state to the destination state
      * @param _src The source state
@@ -129,7 +130,7 @@ class CUDAAgent : public AgentInterface {
      * @param stream CUDA stream to be used for async CUDA operations
      * @see CUDAFatAgent::transitionState(unsigned int, const std::string &, const std::string &, unsigned int)
      */
-    void transitionState(const std::string &_src, const std::string &_dest, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void transitionState(const std::string &_src, const std::string &_dest, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Scatters agents based on their output of the agent function condition
      * Agents which failed the condition are scattered to the front and marked as disabled
@@ -142,7 +143,7 @@ class CUDAAgent : public AgentInterface {
      * @note Named state must not already contain disabled agents
      * @note The disabled agents are re-enabled using clearFunctionCondition(const std::string &)
      */
-    void processFunctionCondition(const AgentFunctionData& func, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void processFunctionCondition(const AgentFunctionData& func, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Scatters agents from the provided device buffer, this is used for host agent creation
      * The device buffer must be packed according to the param offsets
@@ -154,7 +155,7 @@ class CUDAAgent : public AgentInterface {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void scatterHostCreation(const std::string &state_name, unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterHostCreation(const std::string &state_name, unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Sorts all agent variables according to the positions stored inside Message Output scan buffer
      * @param state_name The state agents are scattered into
@@ -163,7 +164,7 @@ class CUDAAgent : public AgentInterface {
      * @param stream CUDA stream to be used for async CUDA operations
      * @see HostAgentAPI::sort(const std::string &, HostAgentAPI::Order, int, int)
      */
-    void scatterSort_async(const std::string &state_name, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterSort_async(const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Allocates a buffer for storing new agents into and
      * uses the cuRVE runtime to map variables for use with an agent function that has device agent birth
@@ -176,7 +177,7 @@ class CUDAAgent : public AgentInterface {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @note This method is async, the stream used it not synchronised
      */
-    void mapNewRuntimeVariables_async(const CUDAAgent& func_agent, const AgentFunctionData& func, unsigned int maxLen, CUDAScatter &scatter, unsigned int instance_id, cudaStream_t stream, unsigned int streamId);
+    void mapNewRuntimeVariables_async(const CUDAAgent& func_agent, const AgentFunctionData& func, unsigned int maxLen, detail::CUDAScatter &scatter, unsigned int instance_id, cudaStream_t stream, unsigned int streamId);
     /**
      * Releases the buffer that was storing new age data
      * @param func The function.
@@ -191,7 +192,7 @@ class CUDAAgent : public AgentInterface {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void scatterNew(const AgentFunctionData& func, unsigned int newSize, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterNew(const AgentFunctionData& func, unsigned int newSize, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Reenables all disabled agents within the named state
      * @param state The named state to enable all agents within
@@ -397,6 +398,7 @@ class CUDAAgent : public AgentInterface {
     std::map<std::string, std::shared_ptr<DeviceAgentVector_impl>> population_dvec;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAAGENT_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENT_H_
diff --git a/include/flamegpu/gpu/CUDAAgentStateList.h b/include/flamegpu/simulation/detail/CUDAAgentStateList.h
similarity index 91%
rename from include/flamegpu/gpu/CUDAAgentStateList.h
rename to include/flamegpu/simulation/detail/CUDAAgentStateList.h
index 2c13b3c40..0fa39bcf8 100644
--- a/include/flamegpu/gpu/CUDAAgentStateList.h
+++ b/include/flamegpu/simulation/detail/CUDAAgentStateList.h
@@ -1,17 +1,17 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAAGENTSTATELIST_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAAGENTSTATELIST_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENTSTATELIST_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENTSTATELIST_H_
 
 #include <string>
 #include <memory>
 #include <map>
 #include <list>
 
-#include "flamegpu/gpu/CUDAFatAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAFatAgentStateList.h"
 
 namespace flamegpu {
-
-class CUDAScatter;
 struct VarOffsetStruct;
+namespace detail {
+class CUDAScatter;
 class CUDAAgent;
 
 /**
@@ -78,7 +78,7 @@ class CUDAAgentStateList {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void setAgentData(const AgentVector &data, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void setAgentData(const AgentVector &data, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Retrieve agent data from the agent state list into agent state memory
      * @param data data Destination for agent data
@@ -95,14 +95,14 @@ class CUDAAgentStateList {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void scatterHostCreation(unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterHostCreation(unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Sorts all agent variables according to the positions stored inside Message Output scan buffer
      * @param scatter Scatter instance and scan arrays to be used (CUDASimulation::singletons->scatter)
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void scatterSort_async(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterSort_async(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Scatters agents from the currently assigned device agent birth buffer (see member variable newBuffs)
      * The device buffer must be packed in the same format as CUDAAgent::mapNewRuntimeVariables(const AgentFunctionData&, unsigned int, unsigned int)
@@ -113,7 +113,7 @@ class CUDAAgentStateList {
      * @param stream CUDA stream to be used for async CUDA operations
      * @return The number of newly birthed agents
      */
-    unsigned int scatterNew(void * d_newBuff, unsigned int newSize, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    unsigned int scatterNew(void * d_newBuff, unsigned int newSize, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Returns true if the state list is not the primary statelist (and is mapped to a master agent state)
      */
@@ -124,7 +124,7 @@ class CUDAAgentStateList {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void initUnmappedVars(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void initUnmappedVars(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Initialises any agent variables within the CUDAFatAgentStateList which are not present in this CUDAAgentStateList
      * @param count Number of variables to init
@@ -180,6 +180,7 @@ class CUDAAgentStateList {
     std::list<std::shared_ptr<VariableBuffer>> unmappedBuffers;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAAGENTSTATELIST_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAAGENTSTATELIST_H_
diff --git a/include/flamegpu/gpu/detail/CUDAErrorChecking.cuh b/include/flamegpu/simulation/detail/CUDAErrorChecking.cuh
similarity index 93%
rename from include/flamegpu/gpu/detail/CUDAErrorChecking.cuh
rename to include/flamegpu/simulation/detail/CUDAErrorChecking.cuh
index fa7a22de5..968d7897f 100644
--- a/include/flamegpu/gpu/detail/CUDAErrorChecking.cuh
+++ b/include/flamegpu/simulation/detail/CUDAErrorChecking.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_DETAIL_CUDAERRORCHECKING_CUH_
-#define INCLUDE_FLAMEGPU_GPU_DETAIL_CUDAERRORCHECKING_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAERRORCHECKING_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAERRORCHECKING_CUH_
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -72,4 +72,4 @@ inline void gpuLaunchAssert(const char *file, int line) {
 }  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_DETAIL_CUDAERRORCHECKING_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAERRORCHECKING_CUH_
diff --git a/include/flamegpu/gpu/CUDAFatAgent.h b/include/flamegpu/simulation/detail/CUDAFatAgent.h
similarity index 94%
rename from include/flamegpu/gpu/CUDAFatAgent.h
rename to include/flamegpu/simulation/detail/CUDAFatAgent.h
index 0e460ce4f..51fa6140c 100644
--- a/include/flamegpu/gpu/CUDAFatAgent.h
+++ b/include/flamegpu/simulation/detail/CUDAFatAgent.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAFATAGENT_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAFATAGENT_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENT_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENT_H_
 
 #include <memory>
 #include <unordered_map>
@@ -8,13 +8,14 @@
 #include <string>
 
 
-#include "flamegpu/gpu/CUDAAgentStateList.h"
-#include "flamegpu/gpu/CUDAFatAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAFatAgentStateList.h"
 #include "flamegpu/model/SubAgentData.h"
 
 namespace flamegpu {
-
 class HostAPI;
+namespace detail {
+
 
 /**
  * This is a shared CUDAFatAgent
@@ -95,7 +96,7 @@ class CUDAFatAgent {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void processDeath(unsigned int agent_fat_id, const std::string &state_name, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void processDeath(unsigned int agent_fat_id, const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Transitions all active agents from the source state to the destination state
      * @param agent_fat_id The index of the CUDAAgent within this CUDAFatAgent
@@ -105,7 +106,7 @@ class CUDAFatAgent {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void transitionState(unsigned int agent_fat_id, const std::string &_src, const std::string &_dest, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void transitionState(unsigned int agent_fat_id, const std::string &_src, const std::string &_dest, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Reads the flags set by an agent function condition in order to sort agents according to whether they passed or failed
      * Failed agents are sorted to the front and marked as disabled, passing agents are then sorted to the back
@@ -115,7 +116,7 @@ class CUDAFatAgent {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void processFunctionCondition(unsigned int agent_fat_id, const std::string &state_name, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void processFunctionCondition(unsigned int agent_fat_id, const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Marks the specified number of agents within the specified statelist as disabled
      * @param agent_fat_id The index of the CUDAAgent within this CUDAFatAgent
@@ -234,6 +235,7 @@ class CUDAFatAgent {
     bool agent_ids_have_init = true;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAFATAGENT_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENT_H_
diff --git a/include/flamegpu/gpu/CUDAFatAgentStateList.h b/include/flamegpu/simulation/detail/CUDAFatAgentStateList.h
similarity index 93%
rename from include/flamegpu/gpu/CUDAFatAgentStateList.h
rename to include/flamegpu/simulation/detail/CUDAFatAgentStateList.h
index 5da4f1e95..b91f16c56 100644
--- a/include/flamegpu/gpu/CUDAFatAgentStateList.h
+++ b/include/flamegpu/simulation/detail/CUDAFatAgentStateList.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAFATAGENTSTATELIST_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAFATAGENTSTATELIST_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENTSTATELIST_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENTSTATELIST_H_
 
 #include <memory>
 #include <list>
@@ -12,6 +12,7 @@
 #include "flamegpu/model/SubAgentData.h"
 
 namespace flamegpu {
+namespace detail {
 
 class CUDAScatter;
 
@@ -21,8 +22,8 @@ class CUDAScatter;
  * This is required, as two mapped agents might have variables of the same name which are not mapped
  */
 struct AgentVariable{
-    const unsigned int agent;
-    const std::string variable;
+    const unsigned int agent{};
+    const std::string variable{};
     /**
      * Basic comparison operator, required for use of std::map etc
      */
@@ -35,7 +36,7 @@ struct AgentVariable{
  * Hash function so that AgentVariable can be used as a key in a map.
  */
 struct AgentVariableHash {
-    std::size_t operator()(const flamegpu::AgentVariable& k) const noexcept {
+    std::size_t operator()(const AgentVariable& k) const noexcept {
         return ((std::hash<unsigned int>()(k.agent)
             ^ (std::hash<std::string>()(k.variable) << 1)) >> 1);
     }
@@ -201,7 +202,7 @@ class CUDAFatAgentStateList {
      * @param stream CUDA stream to be used for async CUDA operations
      * @return The number of agents that are still alive (this includes temporarily disabled agents due to agent function condition)
      */
-    unsigned int scatterDeath(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    unsigned int scatterDeath(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Scatters all living agents which failed the agent function condition into the swap buffer (there should be no disabled at this time)
      * This does not swap buffers or update disabledAgent)
@@ -211,7 +212,7 @@ class CUDAFatAgentStateList {
      * @return The number of agents that were scattered (the number of agents which failed the condition)
      * @see scatterAgentFunctionConditionTrue(unsigned int, unsigned int)
      */
-    unsigned int scatterAgentFunctionConditionFalse(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    unsigned int scatterAgentFunctionConditionFalse(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Scatters all living agents which passed the agent function condition into the swap buffer (there should be no disabled at this time)
      * Also swaps the buffers and sets the number of disabled agents
@@ -223,14 +224,14 @@ class CUDAFatAgentStateList {
      * @see scatterAgentFunctionConditionFalse(unsigned int)
      * @see setConditionState(unsigned int)
      */
-    unsigned int scatterAgentFunctionConditionTrue(unsigned int conditionFailCount, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    unsigned int scatterAgentFunctionConditionTrue(unsigned int conditionFailCount, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Sorts all agent variables according to the positions stored inside Message Output scan buffer
      * @param scatter Scatter instance and scan arrays to be used (CUDASimulation::singletons->scatter)
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void scatterSort_async(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void scatterSort_async(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Set the number of disabled agents within the state list
      * Updates member var disabledAgents and data_condition for every item inside variables_unique
@@ -248,7 +249,7 @@ class CUDAFatAgentStateList {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void initVariables(std::set<std::shared_ptr<VariableBuffer>> &exclusionSet, const unsigned int initCount, const unsigned initOffset, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void initVariables(std::set<std::shared_ptr<VariableBuffer>> &exclusionSet, const unsigned int initCount, const unsigned initOffset, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Returns the collection of unique variable buffers held by this CUDAFatAgentStateList
      */
@@ -292,6 +293,7 @@ class CUDAFatAgentStateList {
     std::list<std::shared_ptr<VariableBuffer>> variables_unique;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAFATAGENTSTATELIST_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAFATAGENTSTATELIST_H_
diff --git a/include/flamegpu/gpu/CUDAMacroEnvironment.h b/include/flamegpu/simulation/detail/CUDAMacroEnvironment.h
similarity index 95%
rename from include/flamegpu/gpu/CUDAMacroEnvironment.h
rename to include/flamegpu/simulation/detail/CUDAMacroEnvironment.h
index 4f8e2df95..1864751db 100644
--- a/include/flamegpu/gpu/CUDAMacroEnvironment.h
+++ b/include/flamegpu/simulation/detail/CUDAMacroEnvironment.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAMACROENVIRONMENT_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAMACROENVIRONMENT_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMACROENVIRONMENT_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMACROENVIRONMENT_H_
 
 #include <cuda_runtime.h>
 
@@ -11,24 +11,23 @@
 #include <vector>
 #include <memory>
 
-#include "detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/runtime/detail/curve/HostCurve.cuh"
-#include "flamegpu/runtime/utility/HostMacroProperty.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/runtime/environment/HostMacroProperty.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 // forward declare classes from other modules
 
 namespace flamegpu {
+struct SubEnvironmentData;
+struct AgentFunctionData;
+struct EnvironmentData;
+class CUDASimulation;
 namespace detail {
 namespace curve {
 class CurveRTCHost;
 }  // namespace curve
-}  // namespace detail
 
-struct SubEnvironmentData;
-struct AgentFunctionData;
-struct EnvironmentData;
-class CUDASimulation;
 
 /**
  * This class is CUDASimulation's internal handler for macro environment functionality
@@ -51,7 +50,7 @@ class CUDAMacroEnvironment {
             , is_sub(false) { }
         ~MacroEnvProp() {
             if (d_ptr && !is_sub) {
-                gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_ptr));
+                gpuErrchk(flamegpu::detail::cuda::cudaFree(d_ptr));
             }
         }
         MacroEnvProp(const MacroEnvProp& other) = delete;
@@ -243,6 +242,7 @@ HostMacroProperty_swig<T> CUDAMacroEnvironment::getProperty_swig(const std::stri
 }
 #endif
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAMACROENVIRONMENT_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMACROENVIRONMENT_H_
diff --git a/include/flamegpu/gpu/CUDAMessage.h b/include/flamegpu/simulation/detail/CUDAMessage.h
similarity index 91%
rename from include/flamegpu/gpu/CUDAMessage.h
rename to include/flamegpu/simulation/detail/CUDAMessage.h
index 12d0fe7aa..2164ecc79 100644
--- a/include/flamegpu/gpu/CUDAMessage.h
+++ b/include/flamegpu/simulation/detail/CUDAMessage.h
@@ -1,30 +1,28 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAMESSAGE_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAMESSAGE_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGE_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGE_H_
 
 #include <memory>
 #include <utility>
 #include <string>
 
 // include sub classes
-#include "flamegpu/gpu/CUDAMessageList.h"
+#include "flamegpu/simulation/detail/CUDAMessageList.h"
 #include "flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h"
 
 // forward declare classes from other modules
 
 namespace flamegpu {
-
-class CUDAScatter;
 class CUDASimulation;
 struct AgentFunctionData;
 struct MessageData;
+class MessageSpecialisationHandler;
 namespace detail {
+class CUDAScatter;
+class CUDAAgent;
 namespace curve {
 class HostCurve;
 class Curve;
 }  // namespace curve
-}  // namespace detail
-class MessageSpecialisationHandler;
-class CUDAAgent;
 /**
  * This class is CUDASimulation's internal handler for message functionality
  */
@@ -66,7 +64,7 @@ class CUDAMessage {
      * @param streamId Index of stream specific structures used
      * @param stream The CUDAStream to use for CUDA operations
      */
-    void init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     /**
      * Updates message_count to equal newSize, internally reallocates buffer space if more space is required
      * @param newSize The number of messages that the buffer should be capable of storing
@@ -75,7 +73,7 @@ class CUDAMessage {
      * @param streamId Index of stream specific structures used
      * @param keepLen Number of existing messages worth of data to retain through the resize
      */
-    void resize(unsigned int newSize, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, unsigned int keepLen = 0);
+    void resize(unsigned int newSize, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, unsigned int keepLen = 0);
     /**
      * Uses the cuRVE runtime to map the variables used by the agent function to the cuRVE library so that can be accessed by name within a n agent function
      * The read runtime variables are to be used when reading messages
@@ -105,7 +103,7 @@ class CUDAMessage {
      * @param streamId Index of stream specific structures used
      * @throw exception::InvalidCudaMessage If this is called before the internal buffers have been allocated
      */
-    void swap(bool isOptional, unsigned int newMessageCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
+    void swap(bool isOptional, unsigned int newMessageCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
     /**
      * Basic list swap with no additional actions
      */
@@ -122,7 +120,7 @@ class CUDAMessage {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @param stream CUDA stream to be used for async CUDA operations
      */
-    void buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
+    void buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream);
     const void *getMetaDataDevicePtr() const;
 
  protected:
@@ -177,6 +175,7 @@ class CUDAMessage {
     const CUDASimulation& cudaSimulation;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAMESSAGE_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGE_H_
diff --git a/include/flamegpu/gpu/CUDAMessageList.h b/include/flamegpu/simulation/detail/CUDAMessageList.h
similarity index 88%
rename from include/flamegpu/gpu/CUDAMessageList.h
rename to include/flamegpu/simulation/detail/CUDAMessageList.h
index 3edc1d9c5..ed80ded3a 100644
--- a/include/flamegpu/gpu/CUDAMessageList.h
+++ b/include/flamegpu/simulation/detail/CUDAMessageList.h
@@ -1,11 +1,12 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDAMESSAGELIST_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDAMESSAGELIST_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGELIST_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGELIST_H_
 
 #include <string>
 #include <map>
 #include <utility>
 
 namespace flamegpu {
+namespace detail {
 
 class CUDAScatter;
 class CUDAMessage;
@@ -28,7 +29,7 @@ class CUDAMessageList {
      /**
       * Initially allocates message lists based on cuda_message.getMaximumListSize()
       */
-    explicit CUDAMessageList(CUDAMessage& cuda_message, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
+    explicit CUDAMessageList(CUDAMessage& cuda_message, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
     /**
      * Frees all message list memory
      */
@@ -78,7 +79,7 @@ class CUDAMessageList {
      * @param append If true scattered messages will append to the existing message list, otherwise truncate
      * @return Total number of messages now in list (includes old + new counts if appending)
      */
-    virtual unsigned int scatter(unsigned int newCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, bool append);
+    virtual unsigned int scatter(unsigned int newCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, bool append);
     /**
      * Copy all message data from d_swap_list to d_list
      * This ALWAYS performs and append to the existing message list count
@@ -89,7 +90,7 @@ class CUDAMessageList {
      * @param streamId The stream index to use for accessing stream specific resources such as scan compaction arrays and buffers
      * @return Total number of messages now in list (includes old + new counts)
      */
-    virtual unsigned int scatterAll(unsigned int newCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
+    virtual unsigned int scatterAll(unsigned int newCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId);
     /**
      * @return Returns the map<variable_name, device_ptr> for reading message data
      */
@@ -133,6 +134,7 @@ class CUDAMessageList {
     const CUDAMessage& message;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDAMESSAGELIST_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDAMESSAGELIST_H_
diff --git a/include/flamegpu/gpu/CUDAScanCompaction.h b/include/flamegpu/simulation/detail/CUDAScanCompaction.h
similarity index 95%
rename from include/flamegpu/gpu/CUDAScanCompaction.h
rename to include/flamegpu/simulation/detail/CUDAScanCompaction.h
index 4b0c58d10..a0816520e 100644
--- a/include/flamegpu/gpu/CUDAScanCompaction.h
+++ b/include/flamegpu/simulation/detail/CUDAScanCompaction.h
@@ -1,11 +1,11 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDASCANCOMPACTION_H_
-#define INCLUDE_FLAMEGPU_GPU_CUDASCANCOMPACTION_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCANCOMPACTION_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCANCOMPACTION_H_
 #include <driver_types.h>
 
 namespace flamegpu {
-
 // forward declare classes from other modules
 class CUDASimulation;
+namespace detail {
 
 /**
  * A pair of device buffers for performing scan/compaction operations
@@ -136,6 +136,7 @@ class CUDAScanCompaction {
     CUDAScanCompactionConfig configs[MAX_TYPES][MAX_STREAMS];
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDASCANCOMPACTION_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCANCOMPACTION_H_
diff --git a/include/flamegpu/gpu/CUDAScatter.cuh b/include/flamegpu/simulation/detail/CUDAScatter.cuh
similarity index 97%
rename from include/flamegpu/gpu/CUDAScatter.cuh
rename to include/flamegpu/simulation/detail/CUDAScatter.cuh
index 870c35eba..4ab7b21e9 100644
--- a/include/flamegpu/gpu/CUDAScatter.cuh
+++ b/include/flamegpu/simulation/detail/CUDAScatter.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_CUDASCATTER_CUH_
-#define INCLUDE_FLAMEGPU_GPU_CUDASCATTER_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCATTER_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCATTER_CUH_
 
 #include <map>
 #include <string>
@@ -9,12 +9,12 @@
 #include <vector>
 
 #include "flamegpu/model/Variable.h"
-#include "flamegpu/gpu/detail/CubTemporaryMemory.cuh"
-#include "flamegpu/gpu/CUDAScanCompaction.h"
+#include "flamegpu/simulation/detail/CubTemporaryMemory.cuh"
+#include "flamegpu/simulation/detail/CUDAScanCompaction.h"
 
 namespace flamegpu {
-
 struct VarOffsetStruct;
+namespace detail {
 struct VariableBuffer;
 
 /**
@@ -310,6 +310,7 @@ class CUDAScatter {
     void operator=(CUDAScatter const&) = delete;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_CUDASCATTER_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUDASCATTER_CUH_
diff --git a/include/flamegpu/gpu/detail/CubTemporaryMemory.cuh b/include/flamegpu/simulation/detail/CubTemporaryMemory.cuh
similarity index 77%
rename from include/flamegpu/gpu/detail/CubTemporaryMemory.cuh
rename to include/flamegpu/simulation/detail/CubTemporaryMemory.cuh
index 1e049c7dc..9483cfc77 100644
--- a/include/flamegpu/gpu/detail/CubTemporaryMemory.cuh
+++ b/include/flamegpu/simulation/detail/CubTemporaryMemory.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_GPU_DETAIL_CUBTEMPORARYMEMORY_CUH_
-#define INCLUDE_FLAMEGPU_GPU_DETAIL_CUBTEMPORARYMEMORY_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUBTEMPORARYMEMORY_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUBTEMPORARYMEMORY_CUH_
 
 #include <unordered_map>
 #include <utility>
@@ -27,4 +27,4 @@ class CubTemporaryMemory {
 }  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_GPU_DETAIL_CUBTEMPORARYMEMORY_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_CUBTEMPORARYMEMORY_CUH_
diff --git a/include/flamegpu/runtime/utility/EnvironmentManager.cuh b/include/flamegpu/simulation/detail/EnvironmentManager.cuh
similarity index 92%
rename from include/flamegpu/runtime/utility/EnvironmentManager.cuh
rename to include/flamegpu/simulation/detail/EnvironmentManager.cuh
index 6410bafb8..524829cb4 100644
--- a/include/flamegpu/runtime/utility/EnvironmentManager.cuh
+++ b/include/flamegpu/simulation/detail/EnvironmentManager.cuh
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_ENVIRONMENTMANAGER_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_ENVIRONMENTMANAGER_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ENVIRONMENTMANAGER_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ENVIRONMENTMANAGER_CUH_
 
 #include <cuda_runtime.h>
 
@@ -15,13 +15,14 @@
 #include "flamegpu/defines.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
 #include "flamegpu/runtime/detail/curve/HostCurve.cuh"
-#include "flamegpu/util/type_decode.h"
-#include "flamegpu/util/Any.h"
+#include "flamegpu/detail/type_decode.h"
+#include "flamegpu/detail/Any.h"
 
 namespace flamegpu {
 struct SubEnvironmentData;
 struct EnvironmentData;
 class CUDASimulation;
+namespace detail {
 
 /**
  * This class manages the regular (not macro) environment properties for a single simulation instance
@@ -36,7 +37,7 @@ class EnvironmentManager : public std::enable_shared_from_this<EnvironmentManage
      * CUDASimulation::initEnvironmentMgr() requires access to EnvironmentManager::properties and EnvironmentManager::setPropertyDirect()
      * The latter could probably be moved into EnvironmentManager (behind a private method)
      */
-    friend class CUDASimulation;
+    friend class flamegpu::CUDASimulation;
 
  private:
     /**
@@ -327,7 +328,7 @@ class EnvironmentManager : public std::enable_shared_from_this<EnvironmentManage
      * Returns the named property as an any generic
      * @param property_name Name of the variable to be returned
      */
-    util::Any getPropertyAny(const std::string &property_name) const;
+    detail::Any getPropertyAny(const std::string &property_name) const;
     /**
      * Host copy of the device memory pointed to by d_buffer
      */
@@ -364,18 +365,18 @@ template<typename T>
 const EnvironmentManager::EnvProp& EnvironmentManager::findProperty(const std::string& property_name, const bool setter, const size_type length) const {
     // Limited to Arithmetic types
     // Compound types would allow host pointers inside structs to be passed
-    static_assert(std::is_arithmetic<typename type_decode<T>::type_t>::value || std::is_enum<typename type_decode<T>::type_t>::value || std::is_void<typename type_decode<T>::type_t>::value,
+    static_assert(std::is_arithmetic<typename detail::type_decode<T>::type_t>::value || std::is_enum<typename detail::type_decode<T>::type_t>::value || std::is_void<typename detail::type_decode<T>::type_t>::value,
         "Only arithmetic types can be used as environmental properties");
     const auto a = properties.find(property_name);
     if (a != properties.end()) {
-        if (std::type_index(typeid(T)) != std::type_index(typeid(void)) && a->second.type != std::type_index(typeid(typename type_decode<T>::type_t))) {
+        if (std::type_index(typeid(T)) != std::type_index(typeid(void)) && a->second.type != std::type_index(typeid(typename detail::type_decode<T>::type_t))) {
             THROW exception::InvalidEnvPropertyType("Environmental property with name '%s', type (%s) does not match template argument T (%s), "
                 "in EnvironmentManager::setProperty().",
-                property_name.c_str(), a->second.type.name(), typeid(typename type_decode<T>::type_t).name());
-        } else if (length && a->second.elements != type_decode<T>::len_t * length) {
+                property_name.c_str(), a->second.type.name(), typeid(typename detail::type_decode<T>::type_t).name());
+        } else if (length && a->second.elements != detail::type_decode<T>::len_t * length) {
             THROW exception::OutOfBoundsException("Environmental property with name '%s', base length (%u) does not match provided base length (%u), "
                 "in EnvironmentManager::setProperty().",
-                property_name.c_str(), a->second.elements, type_decode<T>::len_t * length);
+                property_name.c_str(), a->second.elements, detail::type_decode<T>::len_t * length);
         } else if (setter && a->second.isConst) {
             THROW exception::ReadOnlyEnvProperty("Environmental property with name '%s' is marked as const and cannot be changed, "
                 "in EnvironmentManager::setProperty().",
@@ -422,14 +423,14 @@ std::array<T, N> EnvironmentManager::setProperty(const std::string &name, const
 template<typename T, flamegpu::size_type N>
 T EnvironmentManager::setProperty(const std::string &name, const size_type index, const T value) {
     const EnvProp& prop = findProperty<T>(name, true, 0);
-    if (N && N != prop.elements / type_decode<T>::len_t) {
+    if (N && N != prop.elements / detail::type_decode<T>::len_t) {
         THROW exception::OutOfBoundsException("Environmental property with name '%s', array length mismatch (%u != %u), "
             "in EnvironmentManager::setProperty().",
-            name.c_str(), N, prop.elements / type_decode<T>::len_t);
-    } else if (index >= prop.elements / type_decode<T>::len_t) {
+            name.c_str(), N, prop.elements / detail::type_decode<T>::len_t);
+    } else if (index >= prop.elements / detail::type_decode<T>::len_t) {
         THROW exception::OutOfBoundsException("Environmental property with name '%s', index (%u) exceeds named environmental property array's length (%u), "
             "in EnvironmentManager::setProperty().",
-            name.c_str(), index, prop.elements / type_decode<T>::len_t);
+            name.c_str(), index, prop.elements / detail::type_decode<T>::len_t);
     }
     // Copy old data to return
     T rtn;
@@ -479,14 +480,14 @@ std::array<T, N> EnvironmentManager::getProperty(const std::string &name) {
 template<typename T, flamegpu::size_type N>
 T EnvironmentManager::getProperty(const std::string &name, const size_type index) {
     const EnvProp& prop = findProperty<T>(name, false, 0);
-    if (N && N != prop.elements / type_decode<T>::len_t) {
+    if (N && N != prop.elements / detail::type_decode<T>::len_t) {
         THROW exception::OutOfBoundsException("Environmental property with name '%s', array length mismatch (%u != %u), "
             "in EnvironmentManager::getProperty().",
-            name.c_str(), N, prop.elements / type_decode<T>::len_t);
-    } else if (index >= prop.elements / type_decode<T>::len_t) {
+            name.c_str(), N, prop.elements / detail::type_decode<T>::len_t);
+    } else if (index >= prop.elements / detail::type_decode<T>::len_t) {
         THROW exception::OutOfBoundsException("Environmental property with name '%s', index (%u) exceeds named environmental property array's length (%u), "
             "in EnvironmentManager::getProperty().",
-            name.c_str(), index, prop.elements / type_decode<T>::len_t);
+            name.c_str(), index, prop.elements / detail::type_decode<T>::len_t);
     }
     // Copy old data to return
     T rtn;
@@ -498,7 +499,7 @@ template<typename T>
 std::vector<T> EnvironmentManager::getPropertyArray(const std::string &name) {
     const EnvProp& prop = findProperty<T>(name, false, 0);
     // Copy old data to return
-    const unsigned int N = prop.elements / type_decode<T>::len_t;
+    const unsigned int N = prop.elements / detail::type_decode<T>::len_t;
     std::vector<T> rtn;
     rtn.resize(N);
     memcpy(rtn.data(), h_buffer + prop.offset, sizeof(T) * N);
@@ -506,6 +507,7 @@ std::vector<T> EnvironmentManager::getPropertyArray(const std::string &name) {
 }
 #endif
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_ENVIRONMENTMANAGER_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ENVIRONMENTMANAGER_CUH_
diff --git a/include/flamegpu/pop/detail/GenericMemoryVector.h b/include/flamegpu/simulation/detail/GenericMemoryVector.h
similarity index 92%
rename from include/flamegpu/pop/detail/GenericMemoryVector.h
rename to include/flamegpu/simulation/detail/GenericMemoryVector.h
index 92a98dd2f..e120000e2 100644
--- a/include/flamegpu/pop/detail/GenericMemoryVector.h
+++ b/include/flamegpu/simulation/detail/GenericMemoryVector.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_POP_DETAIL_GENERICMEMORYVECTOR_H_
-#define INCLUDE_FLAMEGPU_POP_DETAIL_GENERICMEMORYVECTOR_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_GENERICMEMORYVECTOR_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_GENERICMEMORYVECTOR_H_
 
 #include <vector>
 #include <typeindex>
@@ -78,4 +78,4 @@ typedef std::pair<const std::string, std::unique_ptr<GenericMemoryVector>> State
 }  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_DETAIL_GENERICMEMORYVECTOR_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_GENERICMEMORYVECTOR_H_
diff --git a/include/flamegpu/pop/detail/MemoryVector.h b/include/flamegpu/simulation/detail/MemoryVector.h
similarity index 93%
rename from include/flamegpu/pop/detail/MemoryVector.h
rename to include/flamegpu/simulation/detail/MemoryVector.h
index f862ed9de..5f48827aa 100644
--- a/include/flamegpu/pop/detail/MemoryVector.h
+++ b/include/flamegpu/simulation/detail/MemoryVector.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_POP_DETAIL_MEMORYVECTOR_H_
-#define INCLUDE_FLAMEGPU_POP_DETAIL_MEMORYVECTOR_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MEMORYVECTOR_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MEMORYVECTOR_H_
 
 #include <vector>
 #include <typeindex>
@@ -8,7 +8,7 @@
 #include <utility>
 #include <string>
 
-#include "flamegpu/pop/detail/GenericMemoryVector.h"
+#include "flamegpu/simulation/detail/GenericMemoryVector.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
 
 namespace flamegpu {
@@ -126,4 +126,4 @@ class MemoryVector : public GenericMemoryVector {
 }  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_POP_DETAIL_MEMORYVECTOR_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MEMORYVECTOR_H_
diff --git a/include/flamegpu/runtime/utility/RandomManager.cuh b/include/flamegpu/simulation/detail/RandomManager.cuh
similarity index 90%
rename from include/flamegpu/runtime/utility/RandomManager.cuh
rename to include/flamegpu/simulation/detail/RandomManager.cuh
index 668b293aa..53455b060 100644
--- a/include/flamegpu/runtime/utility/RandomManager.cuh
+++ b/include/flamegpu/simulation/detail/RandomManager.cuh
@@ -1,18 +1,19 @@
-#ifndef INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
-#define INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_RANDOMMANAGER_CUH_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_RANDOMMANAGER_CUH_
 
 #include <cstdint>
 #include <random>
 #include <string>
 
 #include "flamegpu/defines.h"
-#include "flamegpu/util/detail/curand.cuh"
-#include "flamegpu/sim/Simulation.h"
+#include "flamegpu/detail/curand.cuh"
+#include "flamegpu/simulation/Simulation.h"
 
 namespace flamegpu {
-
 // forward declare classes
 class CUDASimulation;
+namespace detail {
+
 
 /**
  * Singleton manager for initialising simulation wide random with a common seed
@@ -59,7 +60,7 @@ class RandomManager {
      *     while(length*shrinkModifier>_length)
      *       length*=shrinkModifier
      */
-    util::detail::curandState*resize(size_type _length, cudaStream_t stream);
+    detail::curandState*resize(size_type _length, cudaStream_t stream);
     /**
      * Accessors
      */
@@ -81,14 +82,14 @@ class RandomManager {
      */
     size_type size();
     uint64_t seed();
-    util::detail::curandState*cudaRandomState();
+    detail::curandState*cudaRandomState();
 
  private:
     /**
      * Device array holding curand states
      * They should always be initialised
      */
-     util::detail::curandState*d_random_state = nullptr;
+     detail::curandState*d_random_state = nullptr;
     /**
      * Random seed used to initialise all currently allocated curand states
      */
@@ -124,7 +125,7 @@ class RandomManager {
      * @note h_max_random_state will be allocated to length h_max_random_size
      * However, it will only be initialised from hd_random_size(aka length) onwards
      */
-    util::detail::curandState *h_max_random_state = nullptr;
+    detail::curandState *h_max_random_state = nullptr;
     /**
      * Allocated length of h_max_random_state
      */
@@ -179,6 +180,7 @@ T RandomManager::getDistribution(dist &distribution) {
     return distribution(host_rng);
 }
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_RUNTIME_UTILITY_RANDOMMANAGER_CUH_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_RANDOMMANAGER_CUH_
diff --git a/include/flamegpu/sim/SimLogger.h b/include/flamegpu/simulation/detail/SimLogger.h
similarity index 90%
rename from include/flamegpu/sim/SimLogger.h
rename to include/flamegpu/simulation/detail/SimLogger.h
index 14b64db7f..339bacd76 100644
--- a/include/flamegpu/sim/SimLogger.h
+++ b/include/flamegpu/simulation/detail/SimLogger.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_SIMLOGGER_H_
-#define INCLUDE_FLAMEGPU_SIM_SIMLOGGER_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMLOGGER_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMLOGGER_H_
 
 #include <vector>
 #include <thread>
@@ -8,17 +8,18 @@
 #include <string>
 #include <condition_variable>
 
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/LogFrame.h"
 
 namespace flamegpu {
-
 class RunPlanVector;
+class CUDAEnsemble;
+namespace detail {
 
 /**
  * This class is used by CUDAEnsemble::simulate() to collect logs generated by each of the SimRunner instances executing in different threads and write them to disk
  */
 class SimLogger {
-    friend class CUDAEnsemble;
+    friend class flamegpu::CUDAEnsemble;
     /**
      * Constructs a new SimLogger
      * @param run_logs Reference to the vector to store generate run logs
@@ -99,6 +100,7 @@ class SimLogger {
     bool export_exit_time;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_SIMLOGGER_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMLOGGER_H_
diff --git a/include/flamegpu/sim/SimRunner.h b/include/flamegpu/simulation/detail/SimRunner.h
similarity index 94%
rename from include/flamegpu/sim/SimRunner.h
rename to include/flamegpu/simulation/detail/SimRunner.h
index 9d00c733b..0b5f3626f 100644
--- a/include/flamegpu/sim/SimRunner.h
+++ b/include/flamegpu/simulation/detail/SimRunner.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDE_FLAMEGPU_SIM_SIMRUNNER_H_
-#define INCLUDE_FLAMEGPU_SIM_SIMRUNNER_H_
+#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMRUNNER_H_
+#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMRUNNER_H_
 
 #include <atomic>
 #include <memory>
@@ -11,14 +11,15 @@
 #include <string>
 
 #include "flamegpu/defines.h"
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/LogFrame.h"
 
 namespace flamegpu {
-
 struct ModelData;
 class LoggingConfig;
 class StepLoggingConfig;
 class RunPlanVector;
+class CUDAEnsemble;
+namespace detail {
 
 /**
  * A thread class which executes RunPlans on a single GPU
@@ -27,7 +28,7 @@ class RunPlanVector;
  * There may be multiple instances per GPU, if running small models on large GPUs.
  */
 class SimRunner {
-    friend class CUDAEnsemble;
+    friend class flamegpu::CUDAEnsemble;
     struct ErrorDetail {
         unsigned int run_id;
         unsigned int device_id;
@@ -149,6 +150,7 @@ class SimRunner {
     ErrorDetail& fast_err_detail;
 };
 
+}  // namespace detail
 }  // namespace flamegpu
 
-#endif  // INCLUDE_FLAMEGPU_SIM_SIMRUNNER_H_
+#endif  // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_SIMRUNNER_H_
diff --git a/include/flamegpu/visualiser/AgentVis.h b/include/flamegpu/visualiser/AgentVis.h
index 99b90bda7..1402edc8c 100644
--- a/include/flamegpu/visualiser/AgentVis.h
+++ b/include/flamegpu/visualiser/AgentVis.h
@@ -16,7 +16,9 @@
 
 namespace flamegpu {
 struct AgentData;
+namespace detail {
 class CUDAAgent;
+}  // namespace detail
 
 namespace visualiser {
 struct Palette;
@@ -32,7 +34,7 @@ struct AgentVisData {
      * @note Agent states only receive colors from the autopalette when AgentVis::State() is called for each state
      * @note By default, all states share the same color from the autopalette
      */
-    explicit AgentVisData(CUDAAgent& agent, const std::shared_ptr<AutoPalette>& autopalette = nullptr);
+    explicit AgentVisData(detail::CUDAAgent& agent, const std::shared_ptr<AutoPalette>& autopalette = nullptr);
     /**
      * Link to the currently active auto_palette
      */
@@ -53,7 +55,7 @@ struct AgentVisData {
     /**
      * CUDAAgent being rendered
      */
-    CUDAAgent& agent;
+    detail::CUDAAgent& agent;
     /**
      * Agent description hierarchy being rendered
      */
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40bcf2355..bc62ac9c6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -144,7 +144,7 @@ SET(SRC_INCLUDE
     ${FLAMEGPU_ROOT}/include/flamegpu/io/LoggerFactory.h
     ${FLAMEGPU_ROOT}/include/flamegpu/io/XMLLogger.h
     ${FLAMEGPU_ROOT}/include/flamegpu/io/JSONLogger.h
-	${FLAMEGPU_ROOT}/include/flamegpu/io/Telemetry.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/io/Telemetry.h
     ${FLAMEGPU_ROOT}/include/flamegpu/exception/FLAMEGPUException.h
     ${FLAMEGPU_ROOT}/include/flamegpu/exception/FLAMEGPUDeviceException.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/exception/FLAMEGPUDeviceException_device.cuh
@@ -168,37 +168,36 @@ SET(SRC_INCLUDE
     ${FLAMEGPU_ROOT}/include/flamegpu/model/SubEnvironmentDescription.h
     ${FLAMEGPU_ROOT}/include/flamegpu/model/HostFunctionDescription.h
     ${FLAMEGPU_ROOT}/include/flamegpu/model/Variable.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/detail/MemoryVector.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/detail/GenericMemoryVector.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/AgentVector.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/AgentVector_Agent.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/AgentInstance.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/DeviceAgentVector.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/pop/DeviceAgentVector_impl.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAScanCompaction.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/detail/CUDAErrorChecking.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/detail/CubTemporaryMemory.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAMessageList.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDASimulation.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAEnsemble.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAMessage.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAAgent.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAAgentStateList.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAFatAgent.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAFatAgentStateList.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAScatter.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/gpu/CUDAMacroEnvironment.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/AgentInterface.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/AgentLoggingConfig.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/AgentLoggingConfig_SumReturn.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/AgentLoggingConfig_Reductions.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/LoggingConfig.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/LogFrame.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/RunPlan.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/RunPlanVector.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/SimRunner.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/SimLogger.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/sim/Simulation.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/MemoryVector.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/GenericMemoryVector.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAScanCompaction.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAErrorChecking.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CubTemporaryMemory.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAMessageList.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAMessage.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAAgent.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAAgentStateList.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAFatAgent.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAFatAgentStateList.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAScatter.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/CUDAMacroEnvironment.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/SimRunner.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/SimLogger.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/AgentInterface.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/AgentVector.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/AgentVector_Agent.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/CUDASimulation.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/CUDAEnsemble.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/AgentLoggingConfig.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/AgentLoggingConfig_SumReturn.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/AgentLoggingConfig_Reductions.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/LoggingConfig.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/LogFrame.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/RunPlan.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/RunPlanVector.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/Simulation.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/EnvironmentManager.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/simulation/detail/RandomManager.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/AgentFunction.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/AgentFunction_shim.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/AgentFunctionCondition.cuh
@@ -207,13 +206,16 @@ SET(SRC_INCLUDE
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/DeviceAPI.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/HostAPI.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/HostAPI_macros.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/HostAgentAPI.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/HostNewAgentAPI.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/detail/SharedBlock.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/detail/curve/Curve.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/detail/curve/DeviceCurve.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/detail/curve/HostCurve.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/detail/curve/curve_rtc.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/agent/HostAgentAPI.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/agent/HostNewAgentAPI.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/agent/AgentInstance.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/agent/DeviceAgentVector.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/agent/DeviceAgentVector_impl.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging_device.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging/MessageSpecialisationHandler.h
@@ -242,30 +244,28 @@ SET(SRC_INCLUDE
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging/MessageBucket/MessageBucketDevice.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/runtime/messaging/MessageSortingType.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/AgentRandom.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/DeviceEnvironment.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/DeviceMacroProperty.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/EnvironmentManager.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/HostEnvironment.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/HostMacroProperty.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/HostRandom.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/utility/RandomManager.cuh    
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/Any.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/random/AgentRandom.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/random/HostRandom.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/environment/DeviceEnvironment.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/environment/DeviceMacroProperty.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/environment/HostEnvironment.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/runtime/environment/HostMacroProperty.cuh
     ${FLAMEGPU_ROOT}/include/flamegpu/util/cleanup.h
     ${FLAMEGPU_ROOT}/include/flamegpu/util/nvtx.h
     ${FLAMEGPU_ROOT}/include/flamegpu/util/StringPair.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/type_decode.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/compute_capability.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/curand.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/wddm.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/CUDAEventTimer.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cuda.cuh
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/cxxname.hpp
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/SignalHandlers.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/StaticAssert.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/SteadyClockTimer.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/Timer.h
-    ${FLAMEGPU_ROOT}/include/flamegpu/util/detail/JitifyCache.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/Any.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/type_decode.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/compute_capability.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/curand.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/wddm.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/CUDAEventTimer.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/cuda.cuh
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/cxxname.hpp
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/SignalHandlers.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/StaticAssert.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/SteadyClockTimer.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/Timer.h
+    ${FLAMEGPU_ROOT}/include/flamegpu/detail/JitifyCache.h
 )
 SET(SRC_FLAMEGPU
     ${FLAMEGPU_ROOT}/src/flamegpu/exception/FLAMEGPUException.cpp
@@ -289,35 +289,37 @@ SET(SRC_FLAMEGPU
     ${FLAMEGPU_ROOT}/src/flamegpu/model/SubEnvironmentData.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/model/SubEnvironmentDescription.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/model/HostFunctionDescription.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/pop/AgentVector.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/pop/AgentVector_Agent.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/pop/AgentInstance.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/pop/DeviceAgentVector_impl.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/detail/CubTemporaryMemory.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAScanCompaction.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAMessageList.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAAgent.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAAgentStateList.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAFatAgent.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAFatAgentStateList.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAMessage.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAScatter.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDASimulation.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAEnsemble.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/gpu/CUDAMacroEnvironment.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/AgentLoggingConfig.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/LoggingConfig.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/LogFrame.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/RunPlan.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/RunPlanVector.cpp
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/SimRunner.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/SimLogger.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/sim/Simulation.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CubTemporaryMemory.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAScanCompaction.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAMessageList.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAAgent.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAAgentStateList.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAFatAgent.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAFatAgentStateList.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAMessage.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAScatter.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/CUDAMacroEnvironment.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/SimRunner.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/SimLogger.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/EnvironmentManager.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/detail/RandomManager.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/AgentVector.cpp
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/AgentVector_Agent.cpp
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/AgentInstance.cpp
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/CUDASimulation.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/CUDAEnsemble.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/AgentLoggingConfig.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/LoggingConfig.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/LogFrame.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/RunPlan.cpp
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/RunPlanVector.cpp
+    ${FLAMEGPU_ROOT}/src/flamegpu/simulation/Simulation.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/detail/curve/Curve.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/detail/curve/curve_rtc.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/detail/curve/HostCurve.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/agent/DeviceAgentVector_impl.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/agent/HostAgentAPI.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/HostAPI.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/HostAgentAPI.cu 
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageBruteForce.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageSpatial2D.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageSpatial3D.cu
@@ -325,21 +327,19 @@ SET(SRC_FLAMEGPU
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageArray2D.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageArray3D.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/runtime/messaging/MessageBucket.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/environment/HostEnvironment.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/random/HostRandom.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/io/JSONStateReader.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/io/JSONStateWriter.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/io/XMLStateReader.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/io/XMLStateWriter.cpp
-	${FLAMEGPU_ROOT}/src/flamegpu/io/Telemetry.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/io/XMLLogger.cu
     ${FLAMEGPU_ROOT}/src/flamegpu/io/JSONLogger.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/utility/HostEnvironment.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/utility/EnvironmentManager.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/utility/RandomManager.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/runtime/utility/HostRandom.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/io/Telemetry.cpp
     ${FLAMEGPU_ROOT}/src/flamegpu/util/cleanup.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/util/detail/compute_capability.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/util/detail/wddm.cu
-    ${FLAMEGPU_ROOT}/src/flamegpu/util/detail/JitifyCache.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/detail/compute_capability.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/detail/wddm.cu
+    ${FLAMEGPU_ROOT}/src/flamegpu/detail/JitifyCache.cu
 )
 SET(SRC_DYNAMIC
     ${DYNAMIC_VERSION_SRC_DEST}
diff --git a/src/flamegpu/util/detail/JitifyCache.cu b/src/flamegpu/detail/JitifyCache.cu
similarity index 97%
rename from src/flamegpu/util/detail/JitifyCache.cu
rename to src/flamegpu/detail/JitifyCache.cu
index 269f0d18a..0813fd983 100644
--- a/src/flamegpu/util/detail/JitifyCache.cu
+++ b/src/flamegpu/detail/JitifyCache.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/util/detail/JitifyCache.h"
+#include "flamegpu/detail/JitifyCache.h"
 
 #include <nvrtc.h>
 
@@ -9,14 +9,13 @@
 
 #include "flamegpu/version.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
 #include "flamegpu/util/nvtx.h"
 
 using jitify::detail::hash_combine;
 using jitify::detail::hash_larson64;
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 namespace {
@@ -309,7 +308,7 @@ std::unique_ptr<KernelInstantiation> JitifyCache::compileKernel(const std::strin
 #endif
 
     // Set the cuda compuate capability architecture to optimize / generate for, based on the values supported by the current dynamiclaly linked nvrtc and the device in question.
-    std::vector<int> nvrtcArchitectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties();
+    std::vector<int> nvrtcArchitectures = detail::compute_capability::getNVRTCSupportedComputeCapabilties();
     if (nvrtcArchitectures.size()) {
         int currentDeviceIdx = 0;
         if (cudaSuccess == cudaGetDevice(&currentDeviceIdx)) {
@@ -415,7 +414,7 @@ void JitifyCache::getKnownHeaders(std::vector<std::string>& headers) {
     headers.push_back("flamegpu/defines.h");
     headers.push_back("flamegpu/exception/FLAMEGPUDeviceException.cuh");
     headers.push_back("flamegpu/exception/FLAMEGPUDeviceException_device.cuh");
-    headers.push_back("flamegpu/gpu/CUDAScanCompaction.h");
+    headers.push_back("flamegpu/simulation/detail/CUDAScanCompaction.h");
     headers.push_back("flamegpu/runtime/AgentFunction.cuh");
     headers.push_back("flamegpu/runtime/AgentFunctionCondition.cuh");
     headers.push_back("flamegpu/runtime/AgentFunctionCondition_shim.cuh");
@@ -436,11 +435,11 @@ void JitifyCache::getKnownHeaders(std::vector<std::string>& headers) {
     headers.push_back("flamegpu/runtime/messaging/MessageSpatial3D.h");
     headers.push_back("flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DDevice.cuh");
     headers.push_back("flamegpu/runtime/messaging/MessageNone.h");
-    headers.push_back("flamegpu/runtime/utility/AgentRandom.cuh");
-    headers.push_back("flamegpu/runtime/utility/DeviceEnvironment.cuh");
-    headers.push_back("flamegpu/runtime/utility/DeviceMacroProperty.cuh");
-    headers.push_back("flamegpu/util/detail/StaticAssert.h");
-    headers.push_back("flamegpu/util/type_decode.h");
+    headers.push_back("flamegpu/runtime/random/AgentRandom.cuh");
+    headers.push_back("flamegpu/runtime/environment/DeviceEnvironment.cuh");
+    headers.push_back("flamegpu/runtime/environment/DeviceMacroProperty.cuh");
+    headers.push_back("flamegpu/detail/StaticAssert.h");
+    headers.push_back("flamegpu/detail/type_decode.h");
     // headers.push_back("jitify_preinclude.h");  // I think Jitify adds this itself
     headers.push_back("limits");
     headers.push_back("limits.h");
@@ -581,5 +580,4 @@ JitifyCache& JitifyCache::getInstance() {
 }
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
diff --git a/src/flamegpu/util/detail/compute_capability.cu b/src/flamegpu/detail/compute_capability.cu
similarity index 97%
rename from src/flamegpu/util/detail/compute_capability.cu
rename to src/flamegpu/detail/compute_capability.cu
index 85a1086c0..c8e820f1e 100644
--- a/src/flamegpu/util/detail/compute_capability.cu
+++ b/src/flamegpu/detail/compute_capability.cu
@@ -2,12 +2,11 @@
 
 #include <cassert>
 
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 int compute_capability::getComputeCapability(int deviceIndex) {
@@ -152,5 +151,4 @@ const std::string compute_capability::getDeviceNames(std::set<int> devices) {
 
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
diff --git a/src/flamegpu/util/detail/wddm.cu b/src/flamegpu/detail/wddm.cu
similarity index 90%
rename from src/flamegpu/util/detail/wddm.cu
rename to src/flamegpu/detail/wddm.cu
index 8cf42cc00..fd0fd5be1 100644
--- a/src/flamegpu/util/detail/wddm.cu
+++ b/src/flamegpu/detail/wddm.cu
@@ -1,8 +1,7 @@
-#include "flamegpu/util/detail/wddm.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/wddm.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 namespace flamegpu {
-namespace util {
 namespace detail {
 
 bool wddm::deviceIsWDDM(int deviceIndex) {
@@ -39,5 +38,4 @@ bool wddm::deviceIsWDDM() {
 }
 
 }  // namespace detail
-}  // namespace util
 }  // namespace flamegpu
diff --git a/src/flamegpu/exception/FLAMEGPUDeviceException.cu b/src/flamegpu/exception/FLAMEGPUDeviceException.cu
index 99f978b7d..517752a94 100644
--- a/src/flamegpu/exception/FLAMEGPUDeviceException.cu
+++ b/src/flamegpu/exception/FLAMEGPUDeviceException.cu
@@ -1,7 +1,7 @@
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
 
@@ -16,13 +16,13 @@ DeviceExceptionManager::DeviceExceptionManager()
 }
 DeviceExceptionManager::~DeviceExceptionManager() {
     for (auto &i : d_buffer) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(i));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(i));
     }
 }
 DeviceExceptionBuffer *DeviceExceptionManager::getDevicePtr(const unsigned int streamId, const cudaStream_t stream) {
-    if (streamId >= CUDAScanCompaction::MAX_STREAMS) {
+    if (streamId >= detail::CUDAScanCompaction::MAX_STREAMS) {
         THROW exception::OutOfBoundsException("Stream id %u is out of bounds, %u >= %u, "
-        "in FLAMEGPUDeviceException::getDevicePtr()\n", streamId, streamId, CUDAScanCompaction::MAX_STREAMS);
+        "in FLAMEGPUDeviceException::getDevicePtr()\n", streamId, streamId, detail::CUDAScanCompaction::MAX_STREAMS);
     }
     // It may be better to move this (and the memsets) out to a separate up-front reset call in the future.
     if (!d_buffer[streamId]) {
@@ -37,9 +37,9 @@ DeviceExceptionBuffer *DeviceExceptionManager::getDevicePtr(const unsigned int s
     return d_buffer[streamId];
 }
 void DeviceExceptionManager::checkError(const std::string &function, const unsigned int streamId, const cudaStream_t stream) {
-    if (streamId >= CUDAScanCompaction::MAX_STREAMS) {
+    if (streamId >= detail::CUDAScanCompaction::MAX_STREAMS) {
         THROW exception::OutOfBoundsException("Stream id %u is out of bounds, %u >= %u, "
-        "in FLAMEGPUDeviceException::checkError()\n", streamId, streamId, CUDAScanCompaction::MAX_STREAMS);
+        "in FLAMEGPUDeviceException::checkError()\n", streamId, streamId, detail::CUDAScanCompaction::MAX_STREAMS);
     }
     if (d_buffer[streamId]) {
         // Grab buffer from device
@@ -55,7 +55,7 @@ void DeviceExceptionManager::checkError(const std::string &function, const unsig
         }
     } else {
         THROW exception::OutOfBoundsException("FLAMEGPUDeviceExceptionBuffer for stream %u has not been allocated, "
-        "in FLAMEGPUDeviceException::checkError()\n", streamId, streamId, CUDAScanCompaction::MAX_STREAMS);
+        "in FLAMEGPUDeviceException::checkError()\n", streamId);
     }
 }
 std::string DeviceExceptionManager::getLocationString(const DeviceExceptionBuffer &b) {
diff --git a/src/flamegpu/io/JSONLogger.cu b/src/flamegpu/io/JSONLogger.cu
index e25284e08..95a609c5a 100644
--- a/src/flamegpu/io/JSONLogger.cu
+++ b/src/flamegpu/io/JSONLogger.cu
@@ -7,8 +7,8 @@
 #include <fstream>
 #include <string>
 
-#include "flamegpu/sim/RunPlan.h"
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/RunPlan.h"
+#include "flamegpu/simulation/LogFrame.h"
 
 namespace flamegpu {
 namespace io {
@@ -26,7 +26,7 @@ void JSONLogger::log(const RunLog &log, bool logConfig, bool logSteps, bool logE
 }
 
 template<typename T>
-void JSONLogger::writeAny(T &writer, const util::Any &value, const unsigned int elements) const {
+void JSONLogger::writeAny(T &writer, const detail::Any &value, const unsigned int elements) const {
     // Output value
     if (elements > 1) {
         writer.StartArray();
diff --git a/src/flamegpu/io/JSONStateReader.cpp b/src/flamegpu/io/JSONStateReader.cpp
index 9814f602a..0c0d6c17a 100644
--- a/src/flamegpu/io/JSONStateReader.cpp
+++ b/src/flamegpu/io/JSONStateReader.cpp
@@ -10,9 +10,9 @@
 #include <cerrno>
 
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/AgentVector.h"
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/util/StringPair.h"
 
 namespace flamegpu {
@@ -21,7 +21,7 @@ namespace io {
 JSONStateReader::JSONStateReader(
     const std::string &model_name,
     const std::unordered_map<std::string, EnvironmentData::PropData> &env_desc,
-    std::unordered_map<std::string, util::Any> &env_init,
+    std::unordered_map<std::string, detail::Any> &env_init,
     util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
     const std::string &input,
     Simulation *sim_instance)
@@ -36,7 +36,7 @@ class JSONStateReader_impl : public rapidjson::BaseReaderHandler<rapidjson::UTF8
     std::string lastKey;
     std::string filename;
     const std::unordered_map<std::string, EnvironmentData::PropData> env_desc;
-    std::unordered_map<std::string, util::Any> &env_init;
+    std::unordered_map<std::string, detail::Any> &env_init;
     /**
      * Used for setting agent values
      */
@@ -57,7 +57,7 @@ class JSONStateReader_impl : public rapidjson::BaseReaderHandler<rapidjson::UTF8
  public:
     JSONStateReader_impl(const std::string &_filename,
         const std::unordered_map<std::string, EnvironmentData::PropData> &_env_desc,
-        std::unordered_map<std::string, util::Any> &_env_init,
+        std::unordered_map<std::string, detail::Any> &_env_init,
         util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &_model_state)
         : filename(_filename)
         , env_desc(_env_desc)
@@ -78,7 +78,7 @@ class JSONStateReader_impl : public rapidjson::BaseReaderHandler<rapidjson::UTF8
             }
             if (current_variable_array_index == 0) {
                 // New property, create buffer with default value and add to map
-                if (!env_init.emplace(lastKey, util::Any(it->second.data)).second) {
+                if (!env_init.emplace(lastKey, detail::Any(it->second.data)).second) {
                     THROW exception::RapidJSONError("Input file contains environment property '%s' multiple times, "
                         "in JSONStateReader::parse()\n", lastKey.c_str());
                 }
diff --git a/src/flamegpu/io/JSONStateWriter.cpp b/src/flamegpu/io/JSONStateWriter.cpp
index eb7e39a4b..083b95314 100644
--- a/src/flamegpu/io/JSONStateWriter.cpp
+++ b/src/flamegpu/io/JSONStateWriter.cpp
@@ -9,17 +9,17 @@
 
 #include "flamegpu/exception/FLAMEGPUException.h"
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/util/StringPair.h"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
 
 namespace flamegpu {
 namespace io {
 
 JSONStateWriter::JSONStateWriter(
     const std::string &model_name,
-    const std::shared_ptr<EnvironmentManager> &env_manager,
+    const std::shared_ptr<detail::EnvironmentManager> &env_manager,
     const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>>&model,
     const unsigned int iterations,
     const std::string &output_file,
diff --git a/src/flamegpu/io/XMLLogger.cu b/src/flamegpu/io/XMLLogger.cu
index 746721b29..49fc4cb79 100644
--- a/src/flamegpu/io/XMLLogger.cu
+++ b/src/flamegpu/io/XMLLogger.cu
@@ -4,8 +4,8 @@
 
 #include "tinyxml2/tinyxml2.h"              // downloaded from https:// github.com/leethomason/tinyxml2, the list of xml parsers : http:// lars.ruoff.free.fr/xmlcpp/
 
-#include "flamegpu/sim/RunPlan.h"
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/RunPlan.h"
+#include "flamegpu/simulation/LogFrame.h"
 
 namespace flamegpu {
 namespace io {
@@ -282,7 +282,7 @@ void XMLLogger::writeCommonLogFrame(tinyxml2::XMLDocument &doc, tinyxml2::XMLEle
     }
 }
 
-void XMLLogger::writeAny(tinyxml2::XMLElement *pElement, const util::Any &value, const unsigned int elements) const {
+void XMLLogger::writeAny(tinyxml2::XMLElement *pElement, const detail::Any &value, const unsigned int elements) const {
     std::stringstream ss;
     // Loop through elements, to construct csv string
     for (unsigned int el = 0; el < elements; ++el) {
diff --git a/src/flamegpu/io/XMLStateReader.cpp b/src/flamegpu/io/XMLStateReader.cpp
index 80cabfdd1..5aa3b0b66 100644
--- a/src/flamegpu/io/XMLStateReader.cpp
+++ b/src/flamegpu/io/XMLStateReader.cpp
@@ -14,9 +14,9 @@
 #include <tuple>
 #include "tinyxml2/tinyxml2.h"              // downloaded from https:// github.com/leethomason/tinyxml2, the list of xml parsers : http:// lars.ruoff.free.fr/xmlcpp/
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/AgentVector.h"
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 
 namespace flamegpu {
 namespace io {
@@ -67,7 +67,7 @@ namespace io {
 XMLStateReader::XMLStateReader(
     const std::string &model_name,
     const std::unordered_map<std::string, EnvironmentData::PropData> &env_desc,
-    std::unordered_map<std::string, util::Any>&env_init,
+    std::unordered_map<std::string, detail::Any>&env_init,
     util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model_state,
     const std::string &input,
     Simulation *sim_instance)
@@ -207,7 +207,7 @@ int XMLStateReader::parse() {
             unsigned int el = 0;
             while (getline(ss, token, ',')) {
                 if (el == 0) {
-                    if (!env_init.emplace(std::string(key), util::Any(it->second.data)).second) {
+                    if (!env_init.emplace(std::string(key), detail::Any(it->second.data)).second) {
                         THROW exception::TinyXMLError("Input file contains environment property '%s' multiple times, "
                             "in XMLStateReader::parse()\n", key);
                     }
diff --git a/src/flamegpu/io/XMLStateWriter.cpp b/src/flamegpu/io/XMLStateWriter.cpp
index ecdbce35e..179462d89 100644
--- a/src/flamegpu/io/XMLStateWriter.cpp
+++ b/src/flamegpu/io/XMLStateWriter.cpp
@@ -12,9 +12,9 @@
 #include "tinyxml2/tinyxml2.h"              // downloaded from https:// github.com/leethomason/tinyxml2, the list of xml parsers : http:// lars.ruoff.free.fr/xmlcpp/
 #include "flamegpu/exception/FLAMEGPUException.h"
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/pop/AgentVector.h"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/simulation/AgentVector.h"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
 
 namespace flamegpu {
 namespace io {
@@ -64,7 +64,7 @@ namespace io {
 
 XMLStateWriter::XMLStateWriter(
     const std::string &model_name,
-    const std::shared_ptr<EnvironmentManager>& env_manager,
+    const std::shared_ptr<detail::EnvironmentManager>& env_manager,
     const util::StringPairUnorderedMap<std::shared_ptr<AgentVector>> &model,
     const unsigned int iterations,
     const std::string &output_file,
diff --git a/src/flamegpu/model/AgentFunctionData.cpp b/src/flamegpu/model/AgentFunctionData.cpp
index dccd3a3cc..d2b51629e 100644
--- a/src/flamegpu/model/AgentFunctionData.cpp
+++ b/src/flamegpu/model/AgentFunctionData.cpp
@@ -3,7 +3,7 @@
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/model/AgentFunctionDescription.h"
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
-#include "flamegpu/util/detail/cxxname.hpp"
+#include "flamegpu/detail/cxxname.hpp"
 
 namespace flamegpu {
 
@@ -58,22 +58,22 @@ AgentFunctionData::AgentFunctionData(const std::shared_ptr<const ModelData> &_mo
             if (_m != _model->messages.end()) {
                 message_input = _m->second;
             }
-        } else if (util::detail::cxxname::getUnqualifiedName(other.message_in_type) != util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone))))) {
+        } else if (detail::cxxname::getUnqualifiedName(other.message_in_type) != detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone))))) {
             THROW exception::InvalidMessageType(
                 "Function '%s' is missing bound input message of type '%s', type provided was '%s'.", other.name.c_str(),
-                util::detail::cxxname::getUnqualifiedName(other.message_in_type).c_str(),
-                util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone)))).c_str());
+                detail::cxxname::getUnqualifiedName(other.message_in_type).c_str(),
+                detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone)))).c_str());
         }
         if (auto a = other.message_output.lock()) {
             auto _m = _model->messages.find(a->name);
             if (_m != _model->messages.end()) {
                 message_output = _m->second;
             }
-        } else if (util::detail::cxxname::getUnqualifiedName(other.message_out_type) != util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone))))) {
+        } else if (detail::cxxname::getUnqualifiedName(other.message_out_type) != detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone))))) {
             THROW exception::InvalidMessageType(
                 "Function '%s' is missing bound output message of type '%s', type provided was '%s'.", other.name.c_str(),
-                util::detail::cxxname::getUnqualifiedName(other.message_out_type).c_str(),
-                util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone)))).c_str());
+                detail::cxxname::getUnqualifiedName(other.message_out_type).c_str(),
+                detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(std::type_index(typeid(MessageNone)))).c_str());
         }
         if (auto a = other.agent_output.lock()) {
             auto _a = _model->agents.find(a->name);
diff --git a/src/flamegpu/model/AgentFunctionDescription.cpp b/src/flamegpu/model/AgentFunctionDescription.cpp
index 35d72b8df..74bbe8f94 100644
--- a/src/flamegpu/model/AgentFunctionDescription.cpp
+++ b/src/flamegpu/model/AgentFunctionDescription.cpp
@@ -7,7 +7,7 @@
 #include <regex>
 
 #include "flamegpu/model/AgentFunctionDescription.h"
-#include "flamegpu/util/detail/cxxname.hpp"
+#include "flamegpu/detail/cxxname.hpp"
 
 namespace flamegpu {
 
@@ -207,8 +207,8 @@ void AgentFunctionDescription::setMessageInput(const std::string &message_name)
     auto a = mdl->messages.find(message_name);
     if (a != mdl->messages.end()) {
         // Just compare the classname is the same, to allow for the various approaches to namespace use. This should only be required for RTC functions.
-        auto message_in_classname = util::detail::cxxname::getUnqualifiedName(this->function->message_in_type);
-        auto demangledClassName = util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
+        auto message_in_classname = detail::cxxname::getUnqualifiedName(this->function->message_in_type);
+        auto demangledClassName = detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
         if (message_in_classname == demangledClassName) {
             this->function->message_input = a->second;
         } else {
@@ -243,8 +243,8 @@ void AgentFunctionDescription::setMessageInput(MessageBruteForce::CDescription m
     if (a != mdl->messages.end()) {
         if (a->second == message.message) {
             // Just compare the classname is the same, to allow for the various approaches to namespace use. This should only be required for RTC functions.
-            auto message_in_classname = util::detail::cxxname::getUnqualifiedName(this->function->message_in_type);
-            auto demangledClassName = util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
+            auto message_in_classname = detail::cxxname::getUnqualifiedName(this->function->message_in_type);
+            auto demangledClassName = detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
             if (message_in_classname == demangledClassName) {
                 this->function->message_input = a->second;
             } else {
@@ -285,8 +285,8 @@ void AgentFunctionDescription::setMessageOutput(const std::string &message_name)
     auto a = mdl->messages.find(message_name);
     if (a != mdl->messages.end()) {
         // Just compare the classname is the same, to allow for the various approaches to namespace use. This should only be required for RTC functions.
-        auto message_out_classname = util::detail::cxxname::getUnqualifiedName(this->function->message_out_type);
-        auto demangledClassName = util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
+        auto message_out_classname = detail::cxxname::getUnqualifiedName(this->function->message_out_type);
+        auto demangledClassName = detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
         if (message_out_classname == demangledClassName) {
             this->function->message_output = a->second;
             if (this->function->message_output_optional) {
@@ -330,8 +330,8 @@ void AgentFunctionDescription::setMessageOutput(MessageBruteForce::CDescription
     if (a != mdl->messages.end()) {
         if (a->second == message.message) {
             // Just compare the classname is the same, to allow for the various approaches to namespace use. This should only be required for RTC functions.
-            auto message_out_classname = util::detail::cxxname::getUnqualifiedName(this->function->message_out_type);
-            auto demangledClassName = util::detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
+            auto message_out_classname = detail::cxxname::getUnqualifiedName(this->function->message_out_type);
+            auto demangledClassName = detail::cxxname::getUnqualifiedName(detail::curve::CurveRTCHost::demangle(a->second->getType()));
             if (message_out_classname == demangledClassName) {
                 this->function->message_output = a->second;
                 if (this->function->message_output_optional) {
diff --git a/src/flamegpu/model/EnvironmentData.cpp b/src/flamegpu/model/EnvironmentData.cpp
index aa0bf7a72..71faf22a1 100644
--- a/src/flamegpu/model/EnvironmentData.cpp
+++ b/src/flamegpu/model/EnvironmentData.cpp
@@ -6,7 +6,7 @@ EnvironmentData::EnvironmentData(std::shared_ptr<const ModelData> _model)
     : model(_model) {
     // Add CUDASimulation specific environment members
     // We do this here, to not break comparing different model description hierarchies before/after CUDASimulation creation
-    properties.emplace("_stepCount", PropData(false, util::Any(0u)));
+    properties.emplace("_stepCount", PropData(false, detail::Any(0u)));
 }
 
 EnvironmentData::EnvironmentData(std::shared_ptr<const ModelData> _model, const EnvironmentData& other)
diff --git a/src/flamegpu/model/EnvironmentDescription.cpp b/src/flamegpu/model/EnvironmentDescription.cpp
index 3c95fb74a..b6b97e5d3 100644
--- a/src/flamegpu/model/EnvironmentDescription.cpp
+++ b/src/flamegpu/model/EnvironmentDescription.cpp
@@ -38,7 +38,7 @@ EnvironmentDescription::EnvironmentDescription(std::shared_ptr<EnvironmentData>
  * Accessors
  */
 void EnvironmentDescription::newProperty(const std::string &name, const char *ptr, size_t length, bool isConst, flamegpu::size_type elements, const std::type_index &type) {
-    environment->properties.emplace(name, EnvironmentData::PropData(isConst, util::Any(ptr, length, type, elements)));
+    environment->properties.emplace(name, EnvironmentData::PropData(isConst, detail::Any(ptr, length, type, elements)));
 }
 
 }  // namespace flamegpu
diff --git a/src/flamegpu/runtime/HostAPI.cu b/src/flamegpu/runtime/HostAPI.cu
index a1266d817..00f75c115 100644
--- a/src/flamegpu/runtime/HostAPI.cu
+++ b/src/flamegpu/runtime/HostAPI.cu
@@ -1,20 +1,20 @@
 #include "flamegpu/runtime/HostAPI.h"
-#include "flamegpu/runtime/HostAgentAPI.cuh"
+#include "flamegpu/runtime/agent/HostAgentAPI.cuh"
 #include "flamegpu/model/ModelDescription.h"
-#include "flamegpu/sim/Simulation.h"
+#include "flamegpu/simulation/Simulation.h"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
 HostAPI::HostAPI(CUDASimulation &_agentModel,
-    RandomManager& rng,
-    CUDAScatter &_scatter,
+    detail::RandomManager& rng,
+    detail::CUDAScatter &_scatter,
     const AgentOffsetMap &_agentOffsets,
     AgentDataMap &_agentData,
-    const std::shared_ptr<EnvironmentManager>& env,
-    CUDAMacroEnvironment &macro_env,
+    const std::shared_ptr<detail::EnvironmentManager>& env,
+    detail::CUDAMacroEnvironment &macro_env,
     const unsigned int _streamId,
     cudaStream_t _stream)
     : random(rng)
@@ -31,7 +31,7 @@ HostAPI::HostAPI(CUDASimulation &_agentModel,
 HostAPI::~HostAPI() {
     // @todo - cuda is not allowed in destructor
     if (d_output_space_size) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_output_space));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_output_space));
         d_output_space_size = 0;
     }
 }
diff --git a/src/flamegpu/pop/DeviceAgentVector_impl.cu b/src/flamegpu/runtime/agent/DeviceAgentVector_impl.cu
similarity index 97%
rename from src/flamegpu/pop/DeviceAgentVector_impl.cu
rename to src/flamegpu/runtime/agent/DeviceAgentVector_impl.cu
index ac1f68235..7ad07ad8d 100644
--- a/src/flamegpu/pop/DeviceAgentVector_impl.cu
+++ b/src/flamegpu/runtime/agent/DeviceAgentVector_impl.cu
@@ -1,12 +1,12 @@
-#include "flamegpu/pop/DeviceAgentVector_impl.h"
-#include "flamegpu/gpu/CUDAAgent.h"
-#include "flamegpu/runtime/HostNewAgentAPI.h"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/runtime/agent/HostNewAgentAPI.h"
 
 namespace flamegpu {
 
-DeviceAgentVector_impl::DeviceAgentVector_impl(CUDAAgent& _cuda_agent, const std::string &_cuda_agent_state,
-    const VarOffsetStruct& _agentOffsets, std::vector<NewAgentStorage>& _newAgentData,
-    CUDAScatter& _scatter, const unsigned int _streamId, const cudaStream_t _stream)
+DeviceAgentVector_impl::DeviceAgentVector_impl(detail::CUDAAgent& _cuda_agent, const std::string &_cuda_agent_state,
+                                               const VarOffsetStruct& _agentOffsets, std::vector<NewAgentStorage>& _newAgentData,
+                                               detail::CUDAScatter& _scatter, const unsigned int _streamId, const cudaStream_t _stream)
     : AgentVector(_cuda_agent.getAgentDescription(), 0)
     , unbound_buffers_has_changed(false)
     , known_device_buffer_size(_cuda_agent.getStateSize(_cuda_agent_state))
diff --git a/src/flamegpu/runtime/HostAgentAPI.cu b/src/flamegpu/runtime/agent/HostAgentAPI.cu
similarity index 90%
rename from src/flamegpu/runtime/HostAgentAPI.cu
rename to src/flamegpu/runtime/agent/HostAgentAPI.cu
index efa014703..0bc5c6a3e 100644
--- a/src/flamegpu/runtime/HostAgentAPI.cu
+++ b/src/flamegpu/runtime/agent/HostAgentAPI.cu
@@ -1,5 +1,5 @@
-#include "flamegpu/runtime/HostAgentAPI.cuh"
-#include "flamegpu/pop/DeviceAgentVector_impl.h"
+#include "flamegpu/runtime/agent/HostAgentAPI.cuh"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
 
 namespace flamegpu {
 
@@ -48,7 +48,7 @@ DeviceAgentVector HostAgentAPI::getPopulationData() {
     std::shared_ptr<DeviceAgentVector_impl> d_vec = agent.getPopulationVec(stateName);
 
     if (!d_vec) {
-        d_vec = std::make_shared<DeviceAgentVector_impl>(static_cast<CUDAAgent&>(agent), stateName, agentOffsets, newAgentData, api.scatter, api.streamId, api.stream);
+        d_vec = std::make_shared<DeviceAgentVector_impl>(static_cast<detail::CUDAAgent&>(agent), stateName, agentOffsets, newAgentData, api.scatter, api.streamId, api.stream);
         agent.setPopulationVec(stateName, d_vec);
     }
     return *d_vec;
diff --git a/src/flamegpu/runtime/detail/curve/HostCurve.cu b/src/flamegpu/runtime/detail/curve/HostCurve.cu
index b06fd5cfe..9adf70af3 100644
--- a/src/flamegpu/runtime/detail/curve/HostCurve.cu
+++ b/src/flamegpu/runtime/detail/curve/HostCurve.cu
@@ -8,9 +8,9 @@
 #include "flamegpu/runtime/detail/curve/HostCurve.cuh"
 
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 namespace detail {
@@ -36,7 +36,7 @@ HostCurve::HostCurve()
 }
 HostCurve::~HostCurve() {
     if (d_curve_table) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_curve_table));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_curve_table));
         d_curve_table = nullptr;
     }
 }
diff --git a/src/flamegpu/runtime/detail/curve/curve_rtc.cpp b/src/flamegpu/runtime/detail/curve/curve_rtc.cpp
index d0261e75b..a24d7ee2d 100644
--- a/src/flamegpu/runtime/detail/curve/curve_rtc.cpp
+++ b/src/flamegpu/runtime/detail/curve/curve_rtc.cpp
@@ -2,9 +2,9 @@
 
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
 #include "flamegpu/exception/FLAMEGPUException.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 // jitify include for demangle
 #ifdef _MSC_VER
@@ -26,7 +26,7 @@ const char* CurveRTCHost::curve_rtc_dynamic_h_template = R"###(dynamic/curve_rtc
 #define CURVE_RTC_DYNAMIC_H_
 
 #include "flamegpu/exception/FLAMEGPUDeviceException.cuh"
-#include "flamegpu/util/type_decode.h"
+#include "flamegpu/detail/type_decode.h"
 #include "flamegpu/runtime/detail/curve/Curve.cuh"
 
 namespace flamegpu {
@@ -175,8 +175,8 @@ __device__ __forceinline__ void DeviceCurve::setNewAgentArrayVariable(const char
 }  // namespace flamegpu 
 
 // has to be included after definition of curve namespace
-#include "flamegpu/runtime/utility/DeviceEnvironment.cuh"
-//#include "flamegpu/runtime/utility/DeviceMacroProperty.cuh"
+#include "flamegpu/runtime/environment/DeviceEnvironment.cuh"
+//#include "flamegpu/runtime/environment/DeviceMacroProperty.cuh"
 
 namespace flamegpu {
 
@@ -210,7 +210,7 @@ CurveRTCHost::CurveRTCHost() : header(CurveRTCHost::curve_rtc_dynamic_h_template
 }
 
 CurveRTCHost::~CurveRTCHost() {
-    gpuErrchk(flamegpu::util::detail::cuda::cudaFreeHost(h_data_buffer));
+    gpuErrchk(flamegpu::detail::cuda::cudaFreeHost(h_data_buffer));
 }
 
 void CurveRTCHost::registerAgentVariable(const char* variableName, const char* type, size_t type_size, unsigned int elements, bool read, bool write) {
@@ -384,10 +384,10 @@ void CurveRTCHost::initHeaderEnvironment(const size_t env_buffer_len) {
             {
                 getEnvVariableImpl <<   "    if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getEnvVariableImpl <<   "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getEnvVariableImpl <<   "        if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getEnvVariableImpl <<   "        if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getEnvVariableImpl <<   "            DTHROW(\"Environment property '%s' type mismatch.\\n\", name);\n";
                 getEnvVariableImpl <<   "            return {};\n";
-                getEnvVariableImpl <<   "        } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                getEnvVariableImpl <<   "        } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 getEnvVariableImpl <<   "            DTHROW(\"Environment property '%s' length mismatch.\\n\", name);\n";
                 getEnvVariableImpl <<   "            return {};\n";
                 getEnvVariableImpl <<   "        }\n";
@@ -410,11 +410,11 @@ void CurveRTCHost::initHeaderEnvironment(const size_t env_buffer_len) {
             if (props.elements > 1) {
                 getEnvArrayVariableImpl << "    if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getEnvArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getEnvArrayVariableImpl << "        const unsigned int t_index = type_decode<T>::len_t * index + type_decode<T>::len_t;\n";
-                getEnvArrayVariableImpl << "        if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getEnvArrayVariableImpl << "        const unsigned int t_index = detail::type_decode<T>::len_t * index + detail::type_decode<T>::len_t;\n";
+                getEnvArrayVariableImpl << "        if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getEnvArrayVariableImpl << "            DTHROW(\"Environment array property '%s' type mismatch.\\n\", name);\n";
                 getEnvArrayVariableImpl << "            return {};\n";
-                getEnvArrayVariableImpl << "        } else if (type_decode<T>::len_t * N != " << element.second.elements << " && N != 0) {\n";  // Special case, env array specifying length is optional as it's not actually required
+                getEnvArrayVariableImpl << "        } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << " && N != 0) {\n";  // Special case, env array specifying length is optional as it's not actually required
                 getEnvArrayVariableImpl << "            DTHROW(\"Environment array property '%s' length mismatch.\\n\", name);\n";
                 getEnvArrayVariableImpl << "            return {};\n";
                 getEnvArrayVariableImpl << "        } else if (t_index > " << element.second.elements << " || t_index < index) {\n";
@@ -513,10 +513,10 @@ void CurveRTCHost::initHeaderSetters() {
             if (props.write) {
                 setAgentVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setAgentVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setAgentVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setAgentVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setAgentVariableImpl << "                  DTHROW(\"Agent variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setAgentVariableImpl << "                  return;\n";
-                setAgentVariableImpl << "              } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                setAgentVariableImpl << "              } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 setAgentVariableImpl << "                  DTHROW(\"Agent variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setAgentVariableImpl << "                  return;\n";
                 setAgentVariableImpl << "              }\n";
@@ -540,10 +540,10 @@ void CurveRTCHost::initHeaderSetters() {
             if (props.write) {
                 setMessageVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setMessageVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setMessageVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setMessageVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setMessageVariableImpl << "                  DTHROW(\"Message variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setMessageVariableImpl << "                  return;\n";
-                setMessageVariableImpl << "              } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                setMessageVariableImpl << "              } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 setMessageVariableImpl << "                  DTHROW(\"Message variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setMessageVariableImpl << "                  return;\n";
                 setMessageVariableImpl << "              }\n";
@@ -567,10 +567,10 @@ void CurveRTCHost::initHeaderSetters() {
             if (props.write) {
                 setNewAgentVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setNewAgentVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setNewAgentVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setNewAgentVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setNewAgentVariableImpl << "                  DTHROW(\"New agent variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setNewAgentVariableImpl << "                  return;\n";
-                setNewAgentVariableImpl << "              } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                setNewAgentVariableImpl << "              } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 setNewAgentVariableImpl << "                  DTHROW(\"New agent variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setNewAgentVariableImpl << "                  return;\n";
                 setNewAgentVariableImpl << "              }\n";
@@ -590,17 +590,17 @@ void CurveRTCHost::initHeaderSetters() {
         size_t ct = 0;
         std::stringstream setAgentArrayVariableImpl;
         if (!agent_variables.empty())
-            setAgentArrayVariableImpl <<             "    const size_t i = (index * type_decode<T>::len_t * N) + type_decode<T>::len_t * array_index;\n";
+            setAgentArrayVariableImpl <<             "    const size_t i = (index * detail::type_decode<T>::len_t * N) + detail::type_decode<T>::len_t * array_index;\n";
         for (const auto &element : agent_variables) {
             RTCVariableProperties props = element.second;
             if (props.write && props.elements > 1) {
                 setAgentArrayVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setAgentArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setAgentArrayVariableImpl << "              const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;\n";
-                setAgentArrayVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setAgentArrayVariableImpl << "              const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;\n";
+                setAgentArrayVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setAgentArrayVariableImpl << "                  DTHROW(\"Agent array variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setAgentArrayVariableImpl << "                  return;\n";
-                setAgentArrayVariableImpl << "              } else if (type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
+                setAgentArrayVariableImpl << "              } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
                 setAgentArrayVariableImpl << "                  DTHROW(\"Agent array variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setAgentArrayVariableImpl << "                  return;\n";
                 setAgentArrayVariableImpl << "              } else if (t_index > " << element.second.elements << " || t_index < array_index) {\n";
@@ -623,17 +623,17 @@ void CurveRTCHost::initHeaderSetters() {
         size_t ct = 0;
         std::stringstream setMessageArrayVariableImpl;
         if (!messageOut_variables.empty())
-            setMessageArrayVariableImpl << "    const size_t i = (index * type_decode<T>::len_t * N) + type_decode<T>::len_t * array_index;\n";
+            setMessageArrayVariableImpl << "    const size_t i = (index * detail::type_decode<T>::len_t * N) + detail::type_decode<T>::len_t * array_index;\n";
         for (const auto& element : messageOut_variables) {
             RTCVariableProperties props = element.second;
             if (props.write && props.elements > 1) {
                 setMessageArrayVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setMessageArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setMessageArrayVariableImpl << "              const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;\n";
-                setMessageArrayVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setMessageArrayVariableImpl << "              const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;\n";
+                setMessageArrayVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setMessageArrayVariableImpl << "                  DTHROW(\"Message array variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setMessageArrayVariableImpl << "                  return;\n";
-                setMessageArrayVariableImpl << "              } else if (type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
+                setMessageArrayVariableImpl << "              } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
                 setMessageArrayVariableImpl << "                  DTHROW(\"Message array variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setMessageArrayVariableImpl << "                  return;\n";
                 setMessageArrayVariableImpl << "              } else if (t_index > " << element.second.elements << " || t_index < array_index) {\n";
@@ -656,17 +656,17 @@ void CurveRTCHost::initHeaderSetters() {
         size_t ct = 0;
         std::stringstream setNewAgentArrayVariableImpl;
         if (!newAgent_variables.empty())
-            setNewAgentArrayVariableImpl << "    const size_t i = (index * type_decode<T>::len_t * N) + type_decode<T>::len_t * array_index;\n";
+            setNewAgentArrayVariableImpl << "    const size_t i = (index * detail::type_decode<T>::len_t * N) + detail::type_decode<T>::len_t * array_index;\n";
         for (const auto &element : newAgent_variables) {
             RTCVariableProperties props = element.second;
             if (props.write && props.elements > 1) {
                 setNewAgentArrayVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 setNewAgentArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                setNewAgentArrayVariableImpl << "              const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;\n";
-                setNewAgentArrayVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                setNewAgentArrayVariableImpl << "              const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;\n";
+                setNewAgentArrayVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 setNewAgentArrayVariableImpl << "                  DTHROW(\"New agent array variable '%s' type mismatch during setVariable().\\n\", name);\n";
                 setNewAgentArrayVariableImpl << "                  return;\n";
-                setNewAgentArrayVariableImpl << "              } else if (type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
+                setNewAgentArrayVariableImpl << "              } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
                 setNewAgentArrayVariableImpl << "                  DTHROW(\"New agent array variable '%s' length mismatch during setVariable().\\n\", name);\n";
                 setNewAgentArrayVariableImpl << "                  return;\n";
                 setNewAgentArrayVariableImpl << "              } else if (t_index > " << element.second.elements << " || t_index < array_index) {\n";
@@ -695,10 +695,10 @@ void CurveRTCHost::initHeaderGetters() {
             if (props.read) {
                 getAgentVariableImpl << "            if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getAgentVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getAgentVariableImpl << "                if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getAgentVariableImpl << "                if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getAgentVariableImpl << "                    DTHROW(\"Agent variable '%s' type mismatch during getVariable().\\n\", name);\n";
                 getAgentVariableImpl << "                    return {};\n";
-                getAgentVariableImpl << "                } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                getAgentVariableImpl << "                } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 getAgentVariableImpl << "                    DTHROW(\"Agent variable '%s' length mismatch during getVariable().\\n\", name);\n";
                 getAgentVariableImpl << "                    return {};\n";
                 getAgentVariableImpl << "                }\n";
@@ -722,10 +722,10 @@ void CurveRTCHost::initHeaderGetters() {
             if (props.read) {
                 getMessageVariableImpl << "            if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getMessageVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getMessageVariableImpl << "                if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getMessageVariableImpl << "                if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getMessageVariableImpl << "                    DTHROW(\"Message variable '%s' type mismatch during getVariable().\\n\", name);\n";
                 getMessageVariableImpl << "                    return {};\n";
-                getMessageVariableImpl << "                } else if(type_decode<T>::len_t != " << element.second.elements << ") {\n";
+                getMessageVariableImpl << "                } else if(detail::type_decode<T>::len_t != " << element.second.elements << ") {\n";
                 getMessageVariableImpl << "                    DTHROW(\"Message variable '%s' length mismatch during getVariable().\\n\", name);\n";
                 getMessageVariableImpl << "                    return {};\n";
                 getMessageVariableImpl << "                }\n";
@@ -795,17 +795,17 @@ void CurveRTCHost::initHeaderGetters() {
         size_t ct = 0;
         std::stringstream getAgentArrayVariableImpl;
         if (!agent_variables.empty())
-            getAgentArrayVariableImpl << "    const size_t i = (index * type_decode<T>::len_t * N) + type_decode<T>::len_t * array_index;\n";
+            getAgentArrayVariableImpl << "    const size_t i = (index * detail::type_decode<T>::len_t * N) + detail::type_decode<T>::len_t * array_index;\n";
         for (const auto &element : agent_variables) {
             RTCVariableProperties props = element.second;
             if (props.read && props.elements > 1) {
                 getAgentArrayVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getAgentArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getAgentArrayVariableImpl << "              const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;\n";
-                getAgentArrayVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getAgentArrayVariableImpl << "              const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;\n";
+                getAgentArrayVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getAgentArrayVariableImpl << "                  DTHROW(\"Agent array variable '%s' type mismatch during getVariable().\\n\", name);\n";
                 getAgentArrayVariableImpl << "                  return {};\n";
-                getAgentArrayVariableImpl << "              } else if (type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
+                getAgentArrayVariableImpl << "              } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
                 getAgentArrayVariableImpl << "                  DTHROW(\"Agent array variable '%s' length mismatch during getVariable().\\n\", name);\n";
                 getAgentArrayVariableImpl << "                  return {};\n";
                 getAgentArrayVariableImpl << "              } else if (t_index > " << element.second.elements << " || t_index < array_index) {\n";
@@ -828,17 +828,17 @@ void CurveRTCHost::initHeaderGetters() {
         size_t ct = 0;
         std::stringstream getMessageArrayVariableImpl;
         if (!messageIn_variables.empty())
-            getMessageArrayVariableImpl << "    const size_t i = (index * type_decode<T>::len_t * N) + type_decode<T>::len_t * array_index;\n";
+            getMessageArrayVariableImpl << "    const size_t i = (index * detail::type_decode<T>::len_t * N) + detail::type_decode<T>::len_t * array_index;\n";
         for (const auto& element : messageIn_variables) {
             RTCVariableProperties props = element.second;
             if (props.read && props.elements > 1) {
                 getMessageArrayVariableImpl << "          if (strings_equal(name, \"" << element.first << "\")) {\n";
                 getMessageArrayVariableImpl << "#if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS\n";
-                getMessageArrayVariableImpl << "              const unsigned int t_index = type_decode<T>::len_t * array_index + type_decode<T>::len_t;\n";
-                getMessageArrayVariableImpl << "              if(sizeof(type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
+                getMessageArrayVariableImpl << "              const unsigned int t_index = detail::type_decode<T>::len_t * array_index + detail::type_decode<T>::len_t;\n";
+                getMessageArrayVariableImpl << "              if(sizeof(detail::type_decode<T>::type_t) != " << element.second.type_size << ") {\n";
                 getMessageArrayVariableImpl << "                  DTHROW(\"Message array variable '%s' type mismatch during getVariable().\\n\", name);\n";
                 getMessageArrayVariableImpl << "                  return {};\n";
-                getMessageArrayVariableImpl << "              } else if (type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
+                getMessageArrayVariableImpl << "              } else if (detail::type_decode<T>::len_t * N != " << element.second.elements << ") {\n";
                 getMessageArrayVariableImpl << "                  DTHROW(\"Message array variable '%s' length mismatch during getVariable().\\n\", name);\n";
                 getMessageArrayVariableImpl << "                  return {};\n";
                 getMessageArrayVariableImpl << "              } else if (t_index > " << element.second.elements << " || t_index < array_index) {\n";
diff --git a/src/flamegpu/runtime/utility/HostEnvironment.cu b/src/flamegpu/runtime/environment/HostEnvironment.cu
similarity index 55%
rename from src/flamegpu/runtime/utility/HostEnvironment.cu
rename to src/flamegpu/runtime/environment/HostEnvironment.cu
index f69d3d902..db068fb6d 100644
--- a/src/flamegpu/runtime/utility/HostEnvironment.cu
+++ b/src/flamegpu/runtime/environment/HostEnvironment.cu
@@ -1,8 +1,8 @@
-#include "flamegpu/runtime/utility/HostEnvironment.cuh"
+#include "flamegpu/runtime/environment/HostEnvironment.cuh"
 
 namespace flamegpu {
 
-HostEnvironment::HostEnvironment(const unsigned int _instance_id, const std::shared_ptr<EnvironmentManager> &env, CUDAMacroEnvironment& _macro_env)
+HostEnvironment::HostEnvironment(const unsigned int _instance_id, const std::shared_ptr<detail::EnvironmentManager> &env, detail::CUDAMacroEnvironment& _macro_env)
     : env_mgr(env)
     , macro_env(_macro_env)
     , instance_id(_instance_id) { }
diff --git a/src/flamegpu/runtime/messaging/MessageArray.cu b/src/flamegpu/runtime/messaging/MessageArray.cu
index 954694194..a2130ae8e 100644
--- a/src/flamegpu/runtime/messaging/MessageArray.cu
+++ b/src/flamegpu/runtime/messaging/MessageArray.cu
@@ -1,11 +1,11 @@
 #include "flamegpu/runtime/messaging/MessageArray.h"
 #include "flamegpu/model/AgentDescription.h"  // Used by Move-Assign
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 
 #include "flamegpu/runtime/messaging/MessageArray/MessageArrayHost.h"
 // #include "flamegpu/runtime/messaging/MessageArray/MessageArrayDevice.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
@@ -14,7 +14,7 @@ namespace flamegpu {
  * Allocates memory on device for message list length
  * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
  */
-MessageArray::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageArray::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
     : MessageSpecialisationHandler()
     , d_metadata(nullptr)
     , sim_message(a)
@@ -24,7 +24,7 @@ MessageArray::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
     hd_metadata.length = d.length;
 }
 
-void MessageArray::CUDAModelHandler::init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray::CUDAModelHandler::init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Allocate messages
     this->sim_message.resize(hd_metadata.length, scatter, stream, streamId);
@@ -50,17 +50,17 @@ void MessageArray::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t stre
 
 void MessageArray::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_metadata != nullptr) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_metadata));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_metadata));
     }
     d_metadata = nullptr;
 
     if (d_write_flag) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
     }
     d_write_flag = nullptr;
     d_write_flag_len = 0;
 }
-void MessageArray::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
     // Zero the output arrays
     auto &read_list = this->sim_message.getReadList();
@@ -78,7 +78,7 @@ void MessageArray::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned i
         if (d_write_flag_len < MESSAGE_COUNT) {
             // Increase length
             if (d_write_flag) {
-                gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+                gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
             }
             d_write_flag_len = static_cast<unsigned int>(MESSAGE_COUNT * 1.1f);
             gpuErrchk(cudaMalloc(&d_write_flag, sizeof(unsigned int) * d_write_flag_len));
@@ -150,7 +150,7 @@ MessageArray::Data::Data(std::shared_ptr<const ModelData> model, const Data &oth
 MessageArray::Data *MessageArray::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageArray::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageArray::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageArray::Data::getType() const { return std::type_index(typeid(MessageArray)); }
diff --git a/src/flamegpu/runtime/messaging/MessageArray2D.cu b/src/flamegpu/runtime/messaging/MessageArray2D.cu
index defc0bdfd..fed8a3191 100644
--- a/src/flamegpu/runtime/messaging/MessageArray2D.cu
+++ b/src/flamegpu/runtime/messaging/MessageArray2D.cu
@@ -1,11 +1,11 @@
 #include "flamegpu/runtime/messaging/MessageArray2D.h"
 #include "flamegpu/model/AgentDescription.h"  // Used by Move-Assign
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 
 #include "flamegpu/runtime/messaging/MessageArray2D/MessageArray2DHost.h"
 // #include "flamegpu/runtime/messaging/MessageArray2D/MessageArray2DDevice.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
@@ -14,7 +14,7 @@ namespace flamegpu {
  * Allocates memory on device for message list length
  * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
  */
-MessageArray2D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageArray2D::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
     : MessageSpecialisationHandler()
     , d_metadata(nullptr)
     , sim_message(a)
@@ -25,7 +25,7 @@ MessageArray2D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
     hd_metadata.length = d.dimensions[0] * d.dimensions[1];
 }
 
-void MessageArray2D::CUDAModelHandler::init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray2D::CUDAModelHandler::init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Allocate messages
     this->sim_message.resize(hd_metadata.length, scatter, stream, streamId);
@@ -51,17 +51,17 @@ void MessageArray2D::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t st
 
 void MessageArray2D::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_metadata != nullptr) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_metadata));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_metadata));
     }
     d_metadata = nullptr;
 
     if (d_write_flag) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
     }
     d_write_flag = nullptr;
     d_write_flag_len = 0;
 }
-void MessageArray2D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray2D::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
     // Zero the output arrays
     auto &read_list = this->sim_message.getReadList();
@@ -79,7 +79,7 @@ void MessageArray2D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned
         if (d_write_flag_len < MESSAGE_COUNT) {
             // Increase length
             if (d_write_flag) {
-                gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+                gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
             }
             d_write_flag_len = static_cast<unsigned int>(MESSAGE_COUNT * 1.1f);
             gpuErrchk(cudaMalloc(&d_write_flag, sizeof(unsigned int) * d_write_flag_len));
@@ -160,7 +160,7 @@ MessageArray2D::Data::Data(std::shared_ptr<const ModelData> model, const Data &o
 MessageArray2D::Data *MessageArray2D::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageArray2D::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageArray2D::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageArray2D::Data::getType() const { return std::type_index(typeid(MessageArray2D)); }
diff --git a/src/flamegpu/runtime/messaging/MessageArray3D.cu b/src/flamegpu/runtime/messaging/MessageArray3D.cu
index 8c245e823..8c9a227bc 100644
--- a/src/flamegpu/runtime/messaging/MessageArray3D.cu
+++ b/src/flamegpu/runtime/messaging/MessageArray3D.cu
@@ -1,11 +1,11 @@
 #include "flamegpu/runtime/messaging/MessageArray3D.h"
 #include "flamegpu/model/AgentDescription.h"  // Used by Move-Assign
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 
 #include "flamegpu/runtime/messaging/MessageArray3D/MessageArray3DHost.h"
 // #include "flamegpu/runtime/messaging/MessageArray3D/MessageArray3DDevice.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
@@ -14,7 +14,7 @@ namespace flamegpu {
  * Allocates memory on device for message list length
  * @param a Parent CUDAMessage, used to access message settings, data ptrs etc
  */
-MessageArray3D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageArray3D::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
     : MessageSpecialisationHandler()
     , d_metadata(nullptr)
     , sim_message(a)
@@ -25,7 +25,7 @@ MessageArray3D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
     hd_metadata.length = d.dimensions[0] * d.dimensions[1] * d.dimensions[2];
 }
 
-void MessageArray3D::CUDAModelHandler::init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray3D::CUDAModelHandler::init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Allocate messages
     this->sim_message.resize(hd_metadata.length, scatter, stream, streamId);
@@ -51,17 +51,17 @@ void MessageArray3D::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t st
 
 void MessageArray3D::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_metadata != nullptr) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_metadata));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_metadata));
     }
     d_metadata = nullptr;
 
     if (d_write_flag) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
     }
     d_write_flag = nullptr;
     d_write_flag_len = 0;
 }
-void MessageArray3D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageArray3D::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
     // Zero the output arrays
     auto &read_list = this->sim_message.getReadList();
@@ -79,7 +79,7 @@ void MessageArray3D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned
         if (d_write_flag_len < MESSAGE_COUNT) {
             // Increase length
             if (d_write_flag) {
-                gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_write_flag));
+                gpuErrchk(flamegpu::detail::cuda::cudaFree(d_write_flag));
             }
             d_write_flag_len = static_cast<unsigned int>(MESSAGE_COUNT * 1.1f);
             gpuErrchk(cudaMalloc(&d_write_flag, sizeof(unsigned int) * d_write_flag_len));
@@ -163,7 +163,7 @@ MessageArray3D::Data::Data(std::shared_ptr<const ModelData> model, const Data &o
 MessageArray3D::Data *MessageArray3D::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageArray3D::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageArray3D::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageArray3D::Data::getType() const { return std::type_index(typeid(MessageArray3D)); }
diff --git a/src/flamegpu/runtime/messaging/MessageBruteForce.cu b/src/flamegpu/runtime/messaging/MessageBruteForce.cu
index bfe59fa9a..38af3c213 100644
--- a/src/flamegpu/runtime/messaging/MessageBruteForce.cu
+++ b/src/flamegpu/runtime/messaging/MessageBruteForce.cu
@@ -1,11 +1,11 @@
 #include "flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h"
 #include "flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceDevice.cuh"
 #include "flamegpu/model/AgentDescription.h"  // Used by Move-Assign
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
-void MessageBruteForce::CUDAModelHandler::init(CUDAScatter &, unsigned int, cudaStream_t stream) {
+void MessageBruteForce::CUDAModelHandler::init(detail::CUDAScatter &, unsigned int, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Allocate messages
     hd_metadata.length = 0;  // This value should already be 0
@@ -21,12 +21,12 @@ void MessageBruteForce::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t
 
 void MessageBruteForce::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_metadata != nullptr) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_metadata));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_metadata));
     }
     d_metadata = nullptr;
 }
 
-void MessageBruteForce::CUDAModelHandler::buildIndex(CUDAScatter &, unsigned int, cudaStream_t stream) {
+void MessageBruteForce::CUDAModelHandler::buildIndex(detail::CUDAScatter &, unsigned int, cudaStream_t stream) {
     unsigned int newLength = this->sim_message.getMessageCount();
     if (newLength != hd_metadata.length) {
         hd_metadata.length = newLength;
@@ -78,7 +78,7 @@ bool MessageBruteForce::Data::operator!=(const MessageBruteForce::Data& rhs) con
     return !operator==(rhs);
 }
 
-std::unique_ptr<MessageSpecialisationHandler> MessageBruteForce::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageBruteForce::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new MessageBruteForce::CUDAModelHandler(owner));
 }
 
diff --git a/src/flamegpu/runtime/messaging/MessageBucket.cu b/src/flamegpu/runtime/messaging/MessageBucket.cu
index 00e3317bf..26696ef95 100644
--- a/src/flamegpu/runtime/messaging/MessageBucket.cu
+++ b/src/flamegpu/runtime/messaging/MessageBucket.cu
@@ -10,16 +10,16 @@
 #endif
 
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 #include "flamegpu/util/nvtx.h"
 
 #include "flamegpu/runtime/messaging/MessageBucket/MessageBucketHost.h"
 // #include "flamegpu/runtime/messaging/MessageBucket/MessageBucketDevice.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
-MessageBucket::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageBucket::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
     : MessageSpecialisationHandler()
     , sim_message(a) {
     flamegpu::util::nvtx::Range range{"MessageBucket::CUDAModelHandler::CUDAModelHandler"};
@@ -48,7 +48,7 @@ __global__ void atomicHistogram1D(
     bin_sub_index[index] = bin_idx;
 }
 
-void MessageBucket::CUDAModelHandler::init(CUDAScatter &, unsigned int, cudaStream_t stream) {
+void MessageBucket::CUDAModelHandler::init(detail::CUDAScatter &, unsigned int, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Set PBM to 0
     gpuErrchk(cudaMemsetAsync(hd_data.PBM, 0x00000000, (bucketCount + 1) * sizeof(unsigned int), stream));
@@ -69,25 +69,25 @@ void MessageBucket::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t str
 void MessageBucket::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_data != nullptr) {
         d_CUB_temp_storage_bytes = 0;
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_histogram));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(hd_data.PBM));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_histogram));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(hd_data.PBM));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_data));
         d_CUB_temp_storage = nullptr;
         d_histogram = nullptr;
         hd_data.PBM = nullptr;
         d_data = nullptr;
         if (d_keys) {
             d_keys_vals_storage_bytes = 0;
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
             d_keys = nullptr;
             d_vals = nullptr;
         }
     }
 }
 
-void MessageBucket::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageBucket::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     flamegpu::util::nvtx::Range range{"MessageBucket::CUDAModelHandler::buildIndex"};
     // Cuda operations all occur within the stream, so only a final sync is required.s
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
@@ -121,7 +121,7 @@ void MessageBucket::CUDAModelHandler::resizeCubTemp() {
     gpuErrchk(cub::DeviceScan::ExclusiveSum(nullptr, bytesCheck, hd_data.PBM, d_histogram, bucketCount + 1));
     if (bytesCheck > d_CUB_temp_storage_bytes) {
         if (d_CUB_temp_storage) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
         }
         d_CUB_temp_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_CUB_temp_storage, d_CUB_temp_storage_bytes));
@@ -132,8 +132,8 @@ void MessageBucket::CUDAModelHandler::resizeKeysVals(const unsigned int newSize)
     size_t bytesCheck = newSize * sizeof(unsigned int);
     if (bytesCheck > d_keys_vals_storage_bytes) {
         if (d_keys) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
         }
         d_keys_vals_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_keys, d_keys_vals_storage_bytes));
@@ -216,7 +216,7 @@ MessageBucket::Data::Data(std::shared_ptr<const ModelData> model, const Data &ot
 MessageBucket::Data *MessageBucket::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageBucket::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageBucket::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageBucket::Data::getType() const { return std::type_index(typeid(MessageBucket)); }
diff --git a/src/flamegpu/runtime/messaging/MessageSpatial2D.cu b/src/flamegpu/runtime/messaging/MessageSpatial2D.cu
index f404f0698..3b304a49a 100644
--- a/src/flamegpu/runtime/messaging/MessageSpatial2D.cu
+++ b/src/flamegpu/runtime/messaging/MessageSpatial2D.cu
@@ -12,13 +12,13 @@
 #include "flamegpu/runtime/messaging.h"
 #include "flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DHost.h"
 #include "flamegpu/runtime/messaging/MessageSpatial2D/MessageSpatial2DDevice.cuh"
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
-MessageSpatial2D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageSpatial2D::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
     : MessageSpecialisationHandler()
     , sim_message(a) {
     flamegpu::util::nvtx::Range range{"MessageSpatial2D::CUDAModelHandler::CUDAModelHandler"};
@@ -55,7 +55,7 @@ __global__ void atomicHistogram2D(
     bin_sub_index[index] = bin_idx;
 }
 
-void MessageSpatial2D::CUDAModelHandler::init(CUDAScatter &, unsigned int, cudaStream_t stream) {
+void MessageSpatial2D::CUDAModelHandler::init(detail::CUDAScatter &, unsigned int, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Set PBM to 0
     gpuErrchk(cudaMemsetAsync(hd_data.PBM, 0x00000000, (binCount + 1) * sizeof(unsigned int), stream));
@@ -76,25 +76,25 @@ void MessageSpatial2D::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t
 void MessageSpatial2D::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_data != nullptr) {
         d_CUB_temp_storage_bytes = 0;
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_histogram));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(hd_data.PBM));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_histogram));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(hd_data.PBM));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_data));
         d_CUB_temp_storage = nullptr;
         d_histogram = nullptr;
         hd_data.PBM = nullptr;
         d_data = nullptr;
         if (d_keys) {
             d_keys_vals_storage_bytes = 0;
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
             d_keys = nullptr;
             d_vals = nullptr;
         }
     }
 }
 
-void MessageSpatial2D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageSpatial2D::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     flamegpu::util::nvtx::Range range{"MessageSpatial2D::CUDAModelHandler::buildIndex"};
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
     resizeKeysVals(this->sim_message.getMaximumListSize());  // Resize based on allocated amount rather than message count
@@ -128,7 +128,7 @@ void MessageSpatial2D::CUDAModelHandler::resizeCubTemp(cudaStream_t stream) {
     gpuErrchk(cub::DeviceScan::ExclusiveSum(nullptr, bytesCheck, hd_data.PBM, d_histogram, binCount + 1, stream));
     if (bytesCheck > d_CUB_temp_storage_bytes) {
         if (d_CUB_temp_storage) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
         }
         d_CUB_temp_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_CUB_temp_storage, d_CUB_temp_storage_bytes));
@@ -139,8 +139,8 @@ void MessageSpatial2D::CUDAModelHandler::resizeKeysVals(const unsigned int newSi
     size_t bytesCheck = newSize * sizeof(unsigned int);
     if (bytesCheck > d_keys_vals_storage_bytes) {
         if (d_keys) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
         }
         d_keys_vals_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_keys, d_keys_vals_storage_bytes));
@@ -259,8 +259,8 @@ MessageSpatial2D::Data::Data(std::shared_ptr<const ModelData> model, const std::
     , maxX(NAN)
     , maxY(NAN) {
     // MessageSpatial2D has x/y variables by default
-    variables.emplace("x", Variable(std::array<typename type_decode<float>::type_t, 1>{}));
-    variables.emplace("y", Variable(std::array<typename type_decode<float>::type_t, 1>{}));
+    variables.emplace("x", Variable(std::array<typename detail::type_decode<float>::type_t, 1>{}));
+    variables.emplace("y", Variable(std::array<typename detail::type_decode<float>::type_t, 1>{}));
 }
 MessageSpatial2D::Data::Data(std::shared_ptr<const ModelData> model, const Data &other)
     : MessageBruteForce::Data(model, other)
@@ -288,7 +288,7 @@ MessageSpatial2D::Data::Data(std::shared_ptr<const ModelData> model, const Data
 MessageSpatial2D::Data *MessageSpatial2D::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageSpatial2D::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageSpatial2D::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageSpatial2D::Data::getType() const { return std::type_index(typeid(MessageSpatial2D)); }
diff --git a/src/flamegpu/runtime/messaging/MessageSpatial3D.cu b/src/flamegpu/runtime/messaging/MessageSpatial3D.cu
index 56010e89b..8e4f396da 100644
--- a/src/flamegpu/runtime/messaging/MessageSpatial3D.cu
+++ b/src/flamegpu/runtime/messaging/MessageSpatial3D.cu
@@ -1,7 +1,7 @@
 #include "flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DHost.h"
 #include "flamegpu/runtime/messaging/MessageSpatial3D/MessageSpatial3DDevice.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 #ifdef _MSC_VER
 #pragma warning(push, 1)
 #pragma warning(disable : 4706 4834)
@@ -13,7 +13,7 @@
 
 
 namespace flamegpu {
-MessageSpatial3D::CUDAModelHandler::CUDAModelHandler(CUDAMessage &a)
+MessageSpatial3D::CUDAModelHandler::CUDAModelHandler(detail::CUDAMessage &a)
   : MessageSpecialisationHandler()
   , sim_message(a) {
     flamegpu::util::nvtx::Range range{"Spatial3D::CUDAModelHandler"};
@@ -54,7 +54,7 @@ __global__ void atomicHistogram3D(
     bin_sub_index[index] = bin_idx;
 }
 
-void MessageSpatial3D::CUDAModelHandler::init(CUDAScatter &, unsigned int, cudaStream_t stream) {
+void MessageSpatial3D::CUDAModelHandler::init(detail::CUDAScatter &, unsigned int, cudaStream_t stream) {
     allocateMetaDataDevicePtr(stream);
     // Set PBM to 0
     gpuErrchk(cudaMemsetAsync(hd_data.PBM, 0x00000000, (binCount + 1) * sizeof(unsigned int), stream));
@@ -75,25 +75,25 @@ void MessageSpatial3D::CUDAModelHandler::allocateMetaDataDevicePtr(cudaStream_t
 void MessageSpatial3D::CUDAModelHandler::freeMetaDataDevicePtr() {
     if (d_data != nullptr) {
         d_CUB_temp_storage_bytes = 0;
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_histogram));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(hd_data.PBM));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_histogram));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(hd_data.PBM));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_data));
         d_CUB_temp_storage = nullptr;
         d_histogram = nullptr;
         hd_data.PBM = nullptr;
         d_data = nullptr;
         if (d_keys) {
             d_keys_vals_storage_bytes = 0;
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
             d_keys = nullptr;
             d_vals = nullptr;
         }
     }
 }
 
-void MessageSpatial3D::CUDAModelHandler::buildIndex(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void MessageSpatial3D::CUDAModelHandler::buildIndex(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     flamegpu::util::nvtx::Range range{"MessageSpatial3D::CUDAModelHandler::buildIndex"};
     const unsigned int MESSAGE_COUNT = this->sim_message.getMessageCount();
     resizeKeysVals(this->sim_message.getMaximumListSize());  // Resize based on allocated amount rather than message count
@@ -128,7 +128,7 @@ void MessageSpatial3D::CUDAModelHandler::resizeCubTemp(cudaStream_t stream) {
     gpuErrchk(cub::DeviceScan::ExclusiveSum(nullptr, bytesCheck, hd_data.PBM, d_histogram, binCount + 1, stream));
     if (bytesCheck > d_CUB_temp_storage_bytes) {
         if (d_CUB_temp_storage) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_CUB_temp_storage));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_CUB_temp_storage));
         }
         d_CUB_temp_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_CUB_temp_storage, d_CUB_temp_storage_bytes));
@@ -139,8 +139,8 @@ void MessageSpatial3D::CUDAModelHandler::resizeKeysVals(const unsigned int newSi
     size_t bytesCheck = newSize * sizeof(unsigned int);
     if (bytesCheck > d_keys_vals_storage_bytes) {
         if (d_keys) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keys));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_vals));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
         }
         d_keys_vals_storage_bytes = bytesCheck;
         gpuErrchk(cudaMalloc(&d_keys, d_keys_vals_storage_bytes));
@@ -236,7 +236,7 @@ MessageSpatial3D::Data::Data(std::shared_ptr<const ModelData> model, const std::
     , minZ(NAN)
     , maxZ(NAN) {
     // MessageSpatial3D has x/y/z variables by default (x/y are inherited)
-    variables.emplace("z", Variable(std::array<typename type_decode<float>::type_t, 1>{}));
+    variables.emplace("z", Variable(std::array<typename detail::type_decode<float>::type_t, 1>{}));
 }
 MessageSpatial3D::Data::Data(std::shared_ptr<const ModelData> model, const Data &other)
     : MessageSpatial2D::Data(model, other)
@@ -252,7 +252,7 @@ MessageSpatial3D::Data::Data(std::shared_ptr<const ModelData> model, const Data
 MessageSpatial3D::Data *MessageSpatial3D::Data::clone(const std::shared_ptr<const ModelData> &newParent) {
     return new Data(newParent, *this);
 }
-std::unique_ptr<MessageSpecialisationHandler> MessageSpatial3D::Data::getSpecialisationHander(CUDAMessage &owner) const {
+std::unique_ptr<MessageSpecialisationHandler> MessageSpatial3D::Data::getSpecialisationHander(detail::CUDAMessage &owner) const {
     return std::unique_ptr<MessageSpecialisationHandler>(new CUDAModelHandler(owner));
 }
 std::type_index MessageSpatial3D::Data::getType() const { return std::type_index(typeid(MessageSpatial3D)); }
diff --git a/src/flamegpu/runtime/utility/HostRandom.cu b/src/flamegpu/runtime/random/HostRandom.cu
similarity index 80%
rename from src/flamegpu/runtime/utility/HostRandom.cu
rename to src/flamegpu/runtime/random/HostRandom.cu
index 78f448419..14f8262e4 100644
--- a/src/flamegpu/runtime/utility/HostRandom.cu
+++ b/src/flamegpu/runtime/random/HostRandom.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/runtime/utility/HostRandom.cuh"
+#include "flamegpu/runtime/random/HostRandom.cuh"
 
 namespace flamegpu {
 
diff --git a/src/flamegpu/pop/AgentInstance.cpp b/src/flamegpu/simulation/AgentInstance.cpp
similarity index 83%
rename from src/flamegpu/pop/AgentInstance.cpp
rename to src/flamegpu/simulation/AgentInstance.cpp
index 63b68d5c0..676b6a4ea 100644
--- a/src/flamegpu/pop/AgentInstance.cpp
+++ b/src/flamegpu/simulation/AgentInstance.cpp
@@ -1,8 +1,8 @@
-#include "flamegpu/pop/AgentInstance.h"
+#include "flamegpu/runtime/agent/AgentInstance.h"
 
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/model/AgentData.h"
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/AgentVector.h"
 
 namespace flamegpu {
 
@@ -10,7 +10,7 @@ AgentInstance::AgentInstance(const CAgentDescription& agent_desc)
     : _agent(agent_desc.agent->clone()) {
     // Fill data map with default values
     for (const auto& v : _agent->variables) {
-        _data.emplace(v.first, util::Any(v.second.default_value, v.second.type_size * v.second.elements, v.second.type, v.second.elements));
+        _data.emplace(v.first, detail::Any(v.second.default_value, v.second.type_size * v.second.elements, v.second.type, v.second.elements));
     }
 }
 
@@ -33,7 +33,7 @@ AgentInstance::AgentInstance(const AgentVector::CAgent& other)
     for (const auto& v : _agent->variables) {
         const auto &it = other_data->at(v.first);
         const auto variable_size = v.second.elements * v.second.type_size;
-        _data.emplace(v.first, util::Any(static_cast<const char*>(it->getReadOnlyDataPtr()) + other.index * variable_size,
+        _data.emplace(v.first, detail::Any(static_cast<const char*>(it->getReadOnlyDataPtr()) + other.index * variable_size,
             variable_size, it->getType(), it->getElements()));
     }
 }
@@ -67,7 +67,7 @@ AgentInstance& AgentInstance::operator=(const AgentVector::CAgent& other) {
     for (const auto& v : _agent->variables) {
         const auto& it = other_data->at(v.first);
         const auto variable_size = v.second.elements * v.second.type_size;
-        _data.emplace(v.first, util::Any(static_cast<const char*>(it->getReadOnlyDataPtr()) + other.index * variable_size,
+        _data.emplace(v.first, detail::Any(static_cast<const char*>(it->getReadOnlyDataPtr()) + other.index * variable_size,
             variable_size, it->getType(), it->getElements()));
     }
     return *this;
diff --git a/src/flamegpu/sim/AgentLoggingConfig.cu b/src/flamegpu/simulation/AgentLoggingConfig.cu
similarity index 97%
rename from src/flamegpu/sim/AgentLoggingConfig.cu
rename to src/flamegpu/simulation/AgentLoggingConfig.cu
index d1afb6e9e..20d8701f5 100644
--- a/src/flamegpu/sim/AgentLoggingConfig.cu
+++ b/src/flamegpu/simulation/AgentLoggingConfig.cu
@@ -1,6 +1,6 @@
 #include <utility>
 
-#include "flamegpu/sim/AgentLoggingConfig.h"
+#include "flamegpu/simulation/AgentLoggingConfig.h"
 #include "flamegpu/model/AgentData.h"
 
 namespace flamegpu {
diff --git a/src/flamegpu/pop/AgentVector.cpp b/src/flamegpu/simulation/AgentVector.cpp
similarity index 99%
rename from src/flamegpu/pop/AgentVector.cpp
rename to src/flamegpu/simulation/AgentVector.cpp
index d5adf772e..e429fb808 100644
--- a/src/flamegpu/pop/AgentVector.cpp
+++ b/src/flamegpu/simulation/AgentVector.cpp
@@ -1,9 +1,9 @@
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/AgentVector.h"
 
 #include <limits>
 
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/pop/AgentVector_Agent.h"
+#include "flamegpu/simulation/AgentVector_Agent.h"
 
 // @todo - this shouldn't be required anymore?
 #ifdef max
diff --git a/src/flamegpu/pop/AgentVector_Agent.cpp b/src/flamegpu/simulation/AgentVector_Agent.cpp
similarity index 97%
rename from src/flamegpu/pop/AgentVector_Agent.cpp
rename to src/flamegpu/simulation/AgentVector_Agent.cpp
index 691640aec..8fe132709 100644
--- a/src/flamegpu/pop/AgentVector_Agent.cpp
+++ b/src/flamegpu/simulation/AgentVector_Agent.cpp
@@ -1,4 +1,4 @@
-#include "flamegpu/pop/AgentVector_Agent.h"
+#include "flamegpu/simulation/AgentVector_Agent.h"
 
 namespace flamegpu {
 
diff --git a/src/flamegpu/gpu/CUDAEnsemble.cu b/src/flamegpu/simulation/CUDAEnsemble.cu
similarity index 94%
rename from src/flamegpu/gpu/CUDAEnsemble.cu
rename to src/flamegpu/simulation/CUDAEnsemble.cu
index f6f9487c4..b5e196ccb 100644
--- a/src/flamegpu/gpu/CUDAEnsemble.cu
+++ b/src/flamegpu/simulation/CUDAEnsemble.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/gpu/CUDAEnsemble.h"
+#include "flamegpu/simulation/CUDAEnsemble.h"
 
 #include <algorithm>
 #include <cstdlib>
@@ -13,16 +13,16 @@
 
 #include "flamegpu/version.h"
 #include "flamegpu/model/ModelDescription.h"
-#include "flamegpu/sim/RunPlanVector.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/util/detail/SteadyClockTimer.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/RunPlanVector.h"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/detail/SteadyClockTimer.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/io/StateWriterFactory.h"
-#include "flamegpu/sim/LoggingConfig.h"
-#include "flamegpu/sim/SimRunner.h"
-#include "flamegpu/sim/LogFrame.h"
-#include "flamegpu/sim/SimLogger.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/LoggingConfig.h"
+#include "flamegpu/simulation/detail/SimRunner.h"
+#include "flamegpu/simulation/LogFrame.h"
+#include "flamegpu/simulation/detail/SimLogger.h"
+#include "flamegpu/detail/cuda.cuh"
 #include "flamegpu/io/Telemetry.h"
 
 namespace flamegpu {
@@ -147,13 +147,13 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     }
     // Check that each device is capable, and init cuda context
     for (auto d = devices.begin(); d != devices.end(); ++d) {
-        if (!util::detail::compute_capability::checkComputeCapability(*d)) {
+        if (!detail::compute_capability::checkComputeCapability(*d)) {
             fprintf(stderr, "FLAMEGPU2 has not been built with an appropriate compute capability for device %d, this device will not be used.\n", *d);
             d = devices.erase(d);
             --d;
         } else {
             gpuErrchk(cudaSetDevice(*d));
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(nullptr));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(nullptr));
         }
     }
     // Return to device 0 (or check original device first?)
@@ -163,10 +163,10 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     std::atomic<unsigned int> err_ct = {0};
     std::atomic<unsigned int> next_run = {0};
     const unsigned int TOTAL_RUNNERS = static_cast<unsigned int>(devices.size()) * config.concurrent_runs;
-    SimRunner *runners = static_cast<SimRunner *>(malloc(sizeof(SimRunner) * TOTAL_RUNNERS));
+    detail::SimRunner *runners = static_cast<detail::SimRunner *>(malloc(sizeof(detail::SimRunner) * TOTAL_RUNNERS));
 
     // Log Time (We can't use CUDA events here, due to device resets)
-    auto ensemble_timer = util::detail::SteadyClockTimer();
+    auto ensemble_timer = detail::SteadyClockTimer();
     ensemble_timer.start();
     // Reset the elapsed time.
     ensemble_elapsed_time = 0.;
@@ -175,14 +175,14 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     std::queue<unsigned int> log_export_queue;
     std::mutex log_export_queue_mutex;
     std::condition_variable log_export_queue_cdn;
-    SimRunner::ErrorDetail fast_err_detail = {};
+    detail::SimRunner::ErrorDetail fast_err_detail = {};
 
     // Init with placement new
     {
         unsigned int i = 0;
         for (auto &d : devices) {
             for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
-                new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
+                new (&runners[i++]) detail::SimRunner(model, err_ct, next_run, plans,
                     step_log_config, exit_log_config,
                     d, j,
                     config.verbosity, config.error_level == EnsembleConfig::Fast,
@@ -192,9 +192,9 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     }
 
     // Init log worker
-    SimLogger *log_worker = nullptr;
+    detail::SimLogger *log_worker = nullptr;
     if (!config.out_directory.empty()) {
-        log_worker = new SimLogger(run_logs, plans, config.out_directory, config.out_format, log_export_queue, log_export_queue_mutex, log_export_queue_cdn,
+        log_worker = new detail::SimLogger(run_logs, plans, config.out_directory, config.out_format, log_export_queue, log_export_queue_mutex, log_export_queue_cdn,
         step_log_config.get(), exit_log_config.get(), step_log_config && step_log_config->log_timing, exit_log_config && exit_log_config->log_timing);
     }
 
@@ -233,7 +233,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     if (config.telemetry) {
         // Generate some payload items
         std::map<std::string, std::string> payload_items;
-        payload_items["GPUDevices"] = flamegpu::util::detail::compute_capability::getDeviceNames(config.devices);
+        payload_items["GPUDevices"] = flamegpu::detail::compute_capability::getDeviceNames(config.devices);
         payload_items["SimTime(s)"] = std::to_string(ensemble_elapsed_time);
         #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_PATCH__)
             payload_items["NVCCVersion"] = std::to_string(__CUDACC_VER_MAJOR__) + "." + std::to_string(__CUDACC_VER_MINOR__) + "." + std::to_string(__CUDACC_VER_BUILD__);
diff --git a/src/flamegpu/gpu/CUDASimulation.cu b/src/flamegpu/simulation/CUDASimulation.cu
similarity index 92%
rename from src/flamegpu/gpu/CUDASimulation.cu
rename to src/flamegpu/simulation/CUDASimulation.cu
index e0fc6edcb..98a00c807 100644
--- a/src/flamegpu/gpu/CUDASimulation.cu
+++ b/src/flamegpu/simulation/CUDASimulation.cu
@@ -1,32 +1,32 @@
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 
 
 #include <algorithm>
 #include <string>
 #include <map>
 
-#include "flamegpu/util/detail/curand.cuh"
+#include "flamegpu/detail/curand.cuh"
 #include "flamegpu/model/AgentFunctionData.cuh"
 #include "flamegpu/model/LayerData.h"
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/model/SubModelData.h"
 #include "flamegpu/model/SubAgentData.h"
 #include "flamegpu/runtime/HostAPI.h"
-#include "flamegpu/gpu/CUDAScanCompaction.h"
+#include "flamegpu/simulation/detail/CUDAScanCompaction.h"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/util/detail/SignalHandlers.h"
-#include "flamegpu/util/detail/wddm.cuh"
-#include "flamegpu/util/detail/SteadyClockTimer.h"
-#include "flamegpu/util/detail/CUDAEventTimer.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/detail/SignalHandlers.h"
+#include "flamegpu/detail/wddm.cuh"
+#include "flamegpu/detail/SteadyClockTimer.h"
+#include "flamegpu/detail/CUDAEventTimer.cuh"
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
 #include "flamegpu/runtime/HostFunctionCallback.h"
 #include "flamegpu/runtime/messaging.h"
-#include "flamegpu/gpu/CUDAAgent.h"
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/sim/LoggingConfig.h"
-#include "flamegpu/sim/LogFrame.h"
-#include "flamegpu/sim/RunPlan.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/LoggingConfig.h"
+#include "flamegpu/simulation/LogFrame.h"
+#include "flamegpu/simulation/RunPlan.h"
 #include "flamegpu/version.h"
 #include "flamegpu/model/AgentFunctionDescription.h"
 #include "flamegpu/io/Telemetry.h"
@@ -40,11 +40,11 @@ namespace {
     // file-scope only variable used to cache the driver mode
     bool deviceUsingWDDM = false;
     // Inlined method in the anonymous namespace to create a new timer, subject to the driver model.
-    std::unique_ptr<util::detail::Timer> getDriverAppropriateTimer(bool is_ensemble) {
+    std::unique_ptr<detail::Timer> getDriverAppropriateTimer(bool is_ensemble) {
         if (!deviceUsingWDDM && !is_ensemble) {
-            return std::unique_ptr<util::detail::Timer>(new util::detail::CUDAEventTimer());
+            return std::unique_ptr<detail::Timer>(new detail::CUDAEventTimer());
         } else {
-            return std::unique_ptr<util::detail::Timer>(new util::detail::SteadyClockTimer());
+            return std::unique_ptr<detail::Timer>(new detail::SteadyClockTimer());
         }
     }
 }  // anonymous namespace
@@ -72,21 +72,21 @@ CUDASimulation::CUDASimulation(const std::shared_ptr<const ModelData> &_model)
     , isPureRTC(detectPureRTC(model)) {
     initOffsetsAndMap();
     // Register the signal handler.
-    util::detail::SignalHandlers::registerSignalHandlers();
+    detail::SignalHandlers::registerSignalHandlers();
 
     // populate the CUDA agent map
     const auto &am = model->agents;
     // create new cuda agent and add to the map
     for (auto it = am.cbegin(); it != am.cend(); ++it) {
         // insert into map using value_type and store a reference to the map pair
-        agent_map.emplace(it->first, std::make_unique<CUDAAgent>(*it->second, *this)).first;
+        agent_map.emplace(it->first, std::make_unique<detail::CUDAAgent>(*it->second, *this)).first;
     }
 
     // populate the CUDA message map
     const auto &mm = model->messages;
     // create new cuda message and add to the map
     for (auto it_m = mm.cbegin(); it_m != mm.cend(); ++it_m) {
-        message_map.emplace(it_m->first, std::make_unique<CUDAMessage>(*it_m->second, *this));
+        message_map.emplace(it_m->first, std::make_unique<detail::CUDAMessage>(*it_m->second, *this));
     }
 
     // populate the CUDA submodel map
@@ -147,11 +147,11 @@ CUDASimulation::CUDASimulation(const std::shared_ptr<SubModelData> &submodel_des
             if (!masterAgentDesc) {
                 THROW exception::InvalidParent("Master agent description has expired, in CUDASimulation SubModel constructor.\n");
             }
-            std::unique_ptr<CUDAAgent> &masterAgent = master_model->agent_map.at(masterAgentDesc->name);
-            agent_map.emplace(it->first, std::make_unique<CUDAAgent>(*it->second, *this, masterAgent, mapping));
+            std::unique_ptr<detail::CUDAAgent> &masterAgent = master_model->agent_map.at(masterAgentDesc->name);
+            agent_map.emplace(it->first, std::make_unique<detail::CUDAAgent>(*it->second, *this, masterAgent, mapping));
         } else {
             // Agent is not mapped, create regular agent
-            agent_map.emplace(it->first, std::make_unique<CUDAAgent>(*it->second, *this)).first;
+            agent_map.emplace(it->first, std::make_unique<detail::CUDAAgent>(*it->second, *this)).first;
         }
     }  // insert into map using value_type
 
@@ -159,7 +159,7 @@ CUDASimulation::CUDASimulation(const std::shared_ptr<SubModelData> &submodel_des
     const auto &mm = model->messages;
     // create new cuda message and add to the map
     for (auto it_m = mm.cbegin(); it_m != mm.cend(); ++it_m) {
-        message_map.emplace(it_m->first, std::make_unique<CUDAMessage>(*it_m->second, *this));
+        message_map.emplace(it_m->first, std::make_unique<detail::CUDAMessage>(*it_m->second, *this));
     }
 
     // populate the CUDA submodel map
@@ -215,7 +215,7 @@ CUDASimulation::~CUDASimulation() {
 
 void CUDASimulation::initFunctions() {
     flamegpu::util::nvtx::Range range{"CUDASimulation::initFunctions"};
-    std::unique_ptr<util::detail::Timer> initFunctionsTimer(new util::detail::SteadyClockTimer());
+    std::unique_ptr<detail::Timer> initFunctionsTimer(new detail::SteadyClockTimer());
     initFunctionsTimer->start();
 
     // Execute normal init functions
@@ -245,7 +245,7 @@ void CUDASimulation::initFunctions() {
 
 void CUDASimulation::exitFunctions() {
     flamegpu::util::nvtx::Range range{"CUDASimulation::exitFunctions"};
-    std::unique_ptr<util::detail::Timer> exitFunctionsTimer(new util::detail::SteadyClockTimer());
+    std::unique_ptr<detail::Timer> exitFunctionsTimer(new detail::SteadyClockTimer());
     exitFunctionsTimer->start();
 
     // Execute exit functions
@@ -403,7 +403,7 @@ void CUDASimulation::determineAgentsToSort() {
 
 void CUDASimulation::spatialSortAgent_async(const std::string& funcName, const std::string& agentName, const std::string& state, const int mode, cudaStream_t stream, unsigned int streamId) {
     // Fetch the appropriate message name
-    CUDAAgent& cuda_agent = getCUDAAgent(agentName);
+    detail::CUDAAgent& cuda_agent = getCUDAAgent(agentName);
 
     const unsigned int state_list_size = cuda_agent.getStateSize(state);
     // Can't sort no agents
@@ -519,7 +519,7 @@ bool CUDASimulation::step() {
     initialiseSingletons();
 
     // Time the individual step, using a CUDAEventTimer if possible, else a steadyClockTimer.
-    std::unique_ptr<util::detail::Timer> stepTimer = getDriverAppropriateTimer(getCUDAConfig().is_ensemble);
+    std::unique_ptr<detail::Timer> stepTimer = getDriverAppropriateTimer(getCUDAConfig().is_ensemble);
     stepTimer->start();
 
     // Init any unset agent IDs
@@ -628,20 +628,20 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
         if ((func_des->condition) || (!func_des->rtc_func_condition_name.empty())) {
             auto func_agent = func_des->parent.lock();
             flamegpu::util::nvtx::Range condition_range{std::string("condition map " + func_agent->name + "::" + func_des->name).c_str()};
-            const CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
+            const detail::CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
 
             const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
             if (state_list_size == 0) {
                 ++streamIdx;
                 continue;
             }
-            singletons->scatter.Scan().resize(state_list_size, CUDAScanCompaction::AGENT_DEATH, streamIdx);
+            singletons->scatter.Scan().resize(state_list_size, detail::CUDAScanCompaction::AGENT_DEATH, streamIdx);
 
             // Configure runtime access of the functions variables within the FLAME_API object
             cuda_agent.mapRuntimeVariables(*func_des, instance_id);
 
             // Zero the scan flag that will be written to
-            singletons->scatter.Scan().zero_async(CUDAScanCompaction::AGENT_DEATH, getStream(streamIdx), streamIdx);
+            singletons->scatter.Scan().zero_async(detail::CUDAScanCompaction::AGENT_DEATH, getStream(streamIdx), streamIdx);
             // No sync, this occurs in same stream as dependent kernel launch
 
             // Push function's RTC cache to device if using RTC
@@ -665,7 +665,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any condition kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Track which stream to use for concurrency
         streamIdx = 0;
         // Sum the total number of threads being launched in the layer, for rng offsetting.
@@ -681,7 +681,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
                 std::string agent_name = func_agent->name;
                 std::string func_name = func_des->name;
 
-                const CUDAAgent& cuda_agent = getCUDAAgent(agent_name);
+                const detail::CUDAAgent& cuda_agent = getCUDAAgent(agent_name);
 
                 const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
                 if (state_list_size == 0) {
@@ -694,8 +694,8 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
                 int gridSize = 0;  // The actual grid size needed, based on input size
 
                 //  Agent function condition kernel wrapper args
-                util::detail::curandState *t_rng = d_rng + totalThreads;
-                unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
+                detail::curandState *t_rng = d_rng + totalThreads;
+                unsigned int *scanFlag_agentDeath = this->singletons->scatter.Scan().Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag;
 #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
                 auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
 #endif
@@ -757,7 +757,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
                 THROW exception::InvalidAgentFunc("Agent function condition refers to expired agent.");
             }
             flamegpu::util::nvtx::Range unmap_range{std::string("condition unmap " + func_agent->name + "::" + func_des->name).c_str()};
-            CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
+            detail::CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
 
             // Skip if no agents in the input state
             const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
@@ -788,19 +788,19 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
         }
         flamegpu::util::nvtx::Range map_range{std::string("map" + func_agent->name + "::" + func_des->name).c_str()};
 
-        const CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
+        const detail::CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
         const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
         if (state_list_size == 0) {
             ++streamIdx;
             continue;
         }
         // Resize death flag array if necessary
-        singletons->scatter.Scan().resize(state_list_size, CUDAScanCompaction::AGENT_DEATH, streamIdx);
+        singletons->scatter.Scan().resize(state_list_size, detail::CUDAScanCompaction::AGENT_DEATH, streamIdx);
 
         // check if a function has an input message
         if (auto im = func_des->message_input.lock()) {
             std::string inpMessage_name = im->name;
-            CUDAMessage& cuda_message = getCUDAMessage(inpMessage_name);
+            detail::CUDAMessage& cuda_message = getCUDAMessage(inpMessage_name);
             // Construct PBM here if required!!
             cuda_message.buildIndex(this->singletons->scatter, streamIdx, this->getStream(streamIdx));  // This is synchronous.
             // Map variables after, as index building can swap arrays
@@ -810,15 +810,15 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
         // check if a function has an output message
         if (auto om = func_des->message_output.lock()) {
             std::string outpMessage_name = om->name;
-            CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
+            detail::CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
             // Resize message list if required
             const unsigned int existingMessages = cuda_message.getTruncateMessageListFlag() ? 0 : cuda_message.getMessageCount();
             cuda_message.resize(existingMessages + state_list_size, this->singletons->scatter, getStream(streamIdx), streamIdx, existingMessages);  // This could have it's internal syncs delayed
             cuda_message.mapWriteRuntimeVariables(*func_des, cuda_agent, state_list_size, getStream(streamIdx));
-            singletons->scatter.Scan().resize(state_list_size, CUDAScanCompaction::MESSAGE_OUTPUT, streamIdx);
+            singletons->scatter.Scan().resize(state_list_size, detail::CUDAScanCompaction::MESSAGE_OUTPUT, streamIdx);
             // Zero the scan flag that will be written to
             if (func_des->message_output_optional)
-                singletons->scatter.Scan().zero_async(CUDAScanCompaction::MESSAGE_OUTPUT, getStream(streamIdx), streamIdx);
+                singletons->scatter.Scan().zero_async(detail::CUDAScanCompaction::MESSAGE_OUTPUT, getStream(streamIdx), streamIdx);
                 // No Sync, any subsequent use should be in same stream
         }
 
@@ -826,7 +826,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
         if (auto oa = func_des->agent_output.lock()) {
             // This will act as a reserve word
             // which is added to variable hashes for agent creation on device
-            CUDAAgent& output_agent = getCUDAAgent(oa->name);
+            detail::CUDAAgent& output_agent = getCUDAAgent(oa->name);
 
             // Map vars with curve (this allocates/requests enough new buffer space if an existing version is not available/suitable)
             output_agent.mapNewRuntimeVariables_async(cuda_agent, *func_des, state_list_size, this->singletons->scatter, instance_id, getStream(streamIdx), streamIdx);
@@ -838,7 +838,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
 
         // Zero the scan flag that will be written to
         if (func_des->has_agent_death) {
-            singletons->scatter.Scan().zero_async(CUDAScanCompaction::AGENT_DEATH, getStream(streamIdx), streamIdx);
+            singletons->scatter.Scan().zero_async(detail::CUDAScanCompaction::AGENT_DEATH, getStream(streamIdx), streamIdx);
             // No Sync, any subsequent use should be in same stream
         }
 
@@ -863,7 +863,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
     // If any kernel needs to be executed, do so, by checking the number of threads from before.
     if (totalThreads > 0) {
         // Ensure RandomManager is the correct size to accommodate all threads to be launched
-        util::detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
+        detail::curandState *d_rng = singletons->rng.resize(totalThreads, getStream(0));
         // Total threads is now used to provide kernel launches an offset to thread-safe thread-index
         totalThreads = 0;
         streamIdx = 0;
@@ -884,7 +884,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             // check if a function has an input message
             if (auto im = func_des->message_input.lock()) {
                 std::string inpMessage_name = im->name;
-                const CUDAMessage& cuda_message = getCUDAMessage(inpMessage_name);
+                const detail::CUDAMessage& cuda_message = getCUDAMessage(inpMessage_name);
 
                 d_in_messagelist_metadata = cuda_message.getMetaDataDevicePtr();
             }
@@ -892,18 +892,18 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             // check if a function has an output message
             if (auto om = func_des->message_output.lock()) {
                 std::string outpMessage_name = om->name;
-                const CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
+                const detail::CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
 
                 d_out_messagelist_metadata = cuda_message.getMetaDataDevicePtr();
             }
 
             // check if a function has agent output
             if (auto oa = func_des->agent_output.lock()) {
-                CUDAAgent& output_agent = getCUDAAgent(oa->name);
+                detail::CUDAAgent& output_agent = getCUDAAgent(oa->name);
                 d_agentOut_nextID = output_agent.getDeviceNextID();
             }
 
-            const CUDAAgent& cuda_agent = getCUDAAgent(agent_name);
+            const detail::CUDAAgent& cuda_agent = getCUDAAgent(agent_name);
 
             const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
             if (state_list_size == 0) {
@@ -916,10 +916,10 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             int gridSize = 0;  // The actual grid size needed, based on input size
 
             // Agent function kernel wrapper args
-            util::detail::curandState *t_rng = d_rng + totalThreads;
-            unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
-            unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
-            unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
+            detail::curandState *t_rng = d_rng + totalThreads;
+            unsigned int *scanFlag_agentDeath = func_des->has_agent_death ? this->singletons->scatter.Scan().Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamIdx).d_ptrs.scan_flag : nullptr;
+            unsigned int *scanFlag_messageOutput = this->singletons->scatter.Scan().Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamIdx).d_ptrs.scan_flag;
+            unsigned int *scanFlag_agentOutput = this->singletons->scatter.Scan().Config(detail::CUDAScanCompaction::Type::AGENT_OUTPUT, streamIdx).d_ptrs.scan_flag;
     #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS
             auto *error_buffer = this->singletons->exception.getDevicePtr(streamIdx, this->getStream(streamIdx));
     #endif
@@ -986,7 +986,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             THROW exception::InvalidAgentFunc("Agent function refers to expired agent.");
         }
         flamegpu::util::nvtx::Range unmap_range{std::string("unmap" + func_agent->name + "::" + func_des->name).c_str()};
-        CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
+        detail::CUDAAgent& cuda_agent = getCUDAAgent(func_agent->name);
 
         const unsigned int state_list_size = cuda_agent.getStateSize(func_des->initial_state);
         // If agent function wasn't executed, these are redundant
@@ -994,7 +994,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             // check if a function has an output message
             if (auto om = func_des->message_output.lock()) {
                 std::string outpMessage_name = om->name;
-                CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
+                detail::CUDAMessage& cuda_message = getCUDAMessage(outpMessage_name);
                 cuda_message.swap(func_des->message_output_optional, state_list_size, this->singletons->scatter, getStream(streamIdx), streamIdx);
                 cuda_message.clearTruncateMessageListFlag();
                 cuda_message.setPBMConstructionRequiredFlag();
@@ -1017,7 +1017,7 @@ void CUDASimulation::stepLayer(const std::shared_ptr<LayerData>& layer, const un
             if (auto oa = func_des->agent_output.lock()) {
                 // This will act as a reserve word
                 // which is added to variable hashes for agent creation on device
-                CUDAAgent& output_agent = getCUDAAgent(oa->name);
+                detail::CUDAAgent& output_agent = getCUDAAgent(oa->name);
                 // Scatter the agent birth
                 output_agent.scatterNew(*func_des, state_list_size, this->singletons->scatter, streamIdx, this->getStream(streamIdx));
                 output_agent.releaseNewBuffer(*func_des);
@@ -1158,7 +1158,7 @@ void CUDASimulation::simulate() {
     initialiseSingletons();
 
     // Create the event timing object, using an appropriate timer implementation.
-    std::unique_ptr<util::detail::Timer> simulationTimer = getDriverAppropriateTimer(getCUDAConfig().is_ensemble);
+    std::unique_ptr<detail::Timer> simulationTimer = getDriverAppropriateTimer(getCUDAConfig().is_ensemble);
     simulationTimer->start();
 
     // Create as many streams as required
@@ -1240,7 +1240,7 @@ void CUDASimulation::simulate() {
     if (getSimulationConfig().telemetry) {
         // Generate some payload items
         std::map<std::string, std::string> payload_items;
-        payload_items["GPUDevices"] = flamegpu::util::detail::compute_capability::getDeviceName(deviceInitialised);
+        payload_items["GPUDevices"] = flamegpu::detail::compute_capability::getDeviceName(deviceInitialised);
         payload_items["SimTime(s)"] = std::to_string(elapsedSecondsSimulation);
         #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_PATCH__)
             payload_items["NVCCVersion"] = std::to_string(__CUDACC_VER_MAJOR__) + "." + std::to_string(__CUDACC_VER_MINOR__) + "." + std::to_string(__CUDACC_VER_BUILD__);
@@ -1380,7 +1380,7 @@ void CUDASimulation::getPopulationData(AgentVector& population, const std::strin
     gpuErrchk(cudaDeviceSynchronize());
 }
 
-CUDAAgent& CUDASimulation::getCUDAAgent(const std::string& agent_name) const {
+detail::CUDAAgent& CUDASimulation::getCUDAAgent(const std::string& agent_name) const {
     CUDAAgentMap::const_iterator it;
     it = agent_map.find(agent_name);
 
@@ -1392,7 +1392,7 @@ CUDAAgent& CUDASimulation::getCUDAAgent(const std::string& agent_name) const {
     return *(it->second);
 }
 
-AgentInterface& CUDASimulation::getAgent(const std::string& agent_name) {
+detail::AgentInterface& CUDASimulation::getAgent(const std::string& agent_name) {
     // Ensure singletons have been initialised
     initialiseSingletons();
 
@@ -1406,7 +1406,7 @@ AgentInterface& CUDASimulation::getAgent(const std::string& agent_name) {
     return *(it->second);
 }
 
-CUDAMessage& CUDASimulation::getCUDAMessage(const std::string& message_name) const {
+detail::CUDAMessage& CUDASimulation::getCUDAMessage(const std::string& message_name) const {
     CUDAMessageMap::const_iterator it;
     it = message_map.find(message_name);
 
@@ -1491,9 +1491,9 @@ void CUDASimulation::applyConfig_derived() {
     }
 
     // Check the compute capability of the device, throw an exception if not valid for the executable.
-    if (!util::detail::compute_capability::checkComputeCapability(static_cast<int>(config.device_id))) {
-        int min_cc = util::detail::compute_capability::minimumCompiledComputeCapability();
-        int cc = util::detail::compute_capability::getComputeCapability(static_cast<int>(config.device_id));
+    if (!detail::compute_capability::checkComputeCapability(static_cast<int>(config.device_id))) {
+        int min_cc = detail::compute_capability::minimumCompiledComputeCapability();
+        int cc = detail::compute_capability::getComputeCapability(static_cast<int>(config.device_id));
         THROW exception::InvalidCUDAComputeCapability("Error application compiled for CUDA Compute Capability %d and above. Device %u is compute capability %d. Rebuild for SM_%d.", min_cc, config.device_id, cc, cc);
     }
 
@@ -1537,16 +1537,16 @@ void CUDASimulation::initialiseSingletons() {
     if (!singletonsInitialised) {
         // If the device has not been specified, also check the compute capability is OK
         // Check the compute capability of the device, throw an exception if not valid for the executable.
-        if (!util::detail::compute_capability::checkComputeCapability(static_cast<int>(config.device_id))) {
-            int min_cc = util::detail::compute_capability::minimumCompiledComputeCapability();
-            int cc = util::detail::compute_capability::getComputeCapability(static_cast<int>(config.device_id));
+        if (!detail::compute_capability::checkComputeCapability(static_cast<int>(config.device_id))) {
+            int min_cc = detail::compute_capability::minimumCompiledComputeCapability();
+            int cc = detail::compute_capability::getComputeCapability(static_cast<int>(config.device_id));
             THROW exception::InvalidCUDAComputeCapability("Error application compiled for CUDA Compute Capability %d and above. Device %u is compute capability %d. Rebuild for SM_%d.", min_cc, config.device_id, cc, cc);
         }
         gpuErrchk(cudaGetDevice(&deviceInitialised));
         // Get references to all required singleton and store in the instance.
         singletons = new Singletons((!submodel)?
-            EnvironmentManager::create(*model->environment) :
-            EnvironmentManager::create(*model->environment, mastermodel->singletons->environment, *submodel->subenvironment));
+            detail::EnvironmentManager::create(*model->environment) :
+            detail::EnvironmentManager::create(*model->environment, mastermodel->singletons->environment, *submodel->subenvironment));
 
         // Reinitialise random for this simulation instance
         singletons->rng.reseed(getSimulationConfig().random_seed);
@@ -1573,7 +1573,7 @@ void CUDASimulation::initialiseSingletons() {
         }
 
         // Store the WDDM/TCC driver mode status, for timer class decisions. Result is cached in the anon namespace to avoid multiple queries
-        deviceUsingWDDM = util::detail::wddm::deviceIsWDDM();
+        deviceUsingWDDM = detail::wddm::deviceIsWDDM();
 
 #ifdef FLAMEGPU_VISUALISATION
         if (visualisation) {
@@ -1606,7 +1606,7 @@ void CUDASimulation::initialiseRTC() {
     // Only do this once.
     if (!rtcInitialised) {
         flamegpu::util::nvtx::Range range{"CUDASimulation::initialiseRTC"};
-        std::unique_ptr<util::detail::Timer> rtcTimer(new util::detail::SteadyClockTimer());
+        std::unique_ptr<detail::Timer> rtcTimer(new detail::SteadyClockTimer());
         rtcTimer->start();
         // Build any RTC functions
         const auto& am = model->agents;
@@ -1700,7 +1700,7 @@ void CUDASimulation::processHostAgentCreation(const unsigned int streamId) {
                     if (size_req > t_bufflen) {
                         if (t_buff) {
                             free(t_buff);
-                            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(dt_buff));
+                            gpuErrchk(flamegpu::detail::cuda::cudaFree(dt_buff));
                         }
                         t_buff = reinterpret_cast<char*>(malloc(size_req));
                         gpuErrchk(cudaMalloc(&dt_buff, size_req));
@@ -1724,7 +1724,7 @@ void CUDASimulation::processHostAgentCreation(const unsigned int streamId) {
     // Release temp memory
     if (t_buff) {
         free(t_buff);
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(dt_buff));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(dt_buff));
     }
 }
 
@@ -1822,18 +1822,18 @@ void CUDASimulation::processStepLog(const double step_time_seconds) {
     if (step_count % step_log_config->frequency != 0)
         return;
     // Iterate members of step log to build the step log frame
-    std::map<std::string, util::Any> environment_log;
+    std::map<std::string, detail::Any> environment_log;
     for (const auto &prop_name : step_log_config->environment) {
         // Fetch the named environment prop
         environment_log.emplace(prop_name, singletons->environment->getPropertyAny(prop_name));
     }
-    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> agents_log;
+    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> agents_log;
     for (const auto &name_state : step_log_config->agents) {
         // Create the named sub map
         const std::string &agent_name = name_state.first.first;
         const std::string &agent_state = name_state.first.second;
         HostAgentAPI host_agent = host_api->agent(agent_name, agent_state);
-        auto &agent_state_log = agents_log.emplace(name_state.first, std::make_pair(std::map<LoggingConfig::NameReductionFn, util::Any>(), UINT_MAX)).first->second;
+        auto &agent_state_log = agents_log.emplace(name_state.first, std::make_pair(std::map<LoggingConfig::NameReductionFn, detail::Any>(), UINT_MAX)).first->second;
         // Log individual variable reductions
         for (const auto &name_reduction : *name_state.second.first) {
             // Perform the corresponding reduction
@@ -1856,18 +1856,18 @@ void CUDASimulation::processExitLog() {
     if (!exit_log_config)
         return;
     // Iterate members of step log to build the step log frame
-    std::map<std::string, util::Any> environment_log;
+    std::map<std::string, detail::Any> environment_log;
     for (const auto &prop_name : exit_log_config->environment) {
         // Fetch the named environment prop
         environment_log.emplace(prop_name, singletons->environment->getPropertyAny(prop_name));
     }
-    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> agents_log;
+    std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> agents_log;
     for (const auto &name_state : exit_log_config->agents) {
         // Create the named sub map
         const std::string &agent_name = name_state.first.first;
         const std::string &agent_state = name_state.first.second;
         HostAgentAPI host_agent = host_api->agent(agent_name, agent_state);
-        auto &agent_state_log = agents_log.emplace(name_state.first, std::make_pair(std::map<LoggingConfig::NameReductionFn, util::Any>(), UINT_MAX)).first->second;
+        auto &agent_state_log = agents_log.emplace(name_state.first, std::make_pair(std::map<LoggingConfig::NameReductionFn, detail::Any>(), UINT_MAX)).first->second;
         // Log individual variable reductions
         for (const auto &name_reduction : *name_state.second.first) {
             // Perform the corresponding reduction
@@ -1927,7 +1927,7 @@ void CUDASimulation::destroyStreams() {
     cudaStreamDestroy and cudaStreamQuery under linux with CUDA 11.8 (and potentialy others) would occasionally segfault after a reset, so it's error code could not be relied on to check if the cudaStream was valid for the current primary context or not.
     Instead, we can use the cudaDriverAPI to check the primary context is correct / valid for the device, and if it is attempt to destory the stream. If it is not, we can assume the device has been reset or CUDA has been shutdown, so the stream has already been destroyed.
     */
-    bool ctxIsActive = flamegpu::util::detail::cuda::cuDevicePrimaryContextIsActive(deviceInitialised);
+    bool ctxIsActive = flamegpu::detail::cuda::cuDevicePrimaryContextIsActive(deviceInitialised);
     if (ctxIsActive) {
         // Destroy streams.
         for (auto stream : streams) {
@@ -1944,7 +1944,7 @@ void CUDASimulation::synchronizeAllStreams() {
     }
 }
 
-std::shared_ptr<EnvironmentManager> CUDASimulation::getEnvironment() const {
+std::shared_ptr<detail::EnvironmentManager> CUDASimulation::getEnvironment() const {
     if (singletons)
         return singletons->environment;
     return nullptr;
diff --git a/src/flamegpu/sim/LogFrame.cu b/src/flamegpu/simulation/LogFrame.cu
similarity index 83%
rename from src/flamegpu/sim/LogFrame.cu
rename to src/flamegpu/simulation/LogFrame.cu
index 94bdef33c..565b7a5f9 100644
--- a/src/flamegpu/sim/LogFrame.cu
+++ b/src/flamegpu/simulation/LogFrame.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/sim/LogFrame.h"
+#include "flamegpu/simulation/LogFrame.h"
 
 namespace flamegpu {
 
@@ -6,8 +6,8 @@ LogFrame::LogFrame()
     : step_count(0) { }
 
 
-LogFrame::LogFrame(const std::map<std::string, util::Any> &_environment,
-const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>> &_agents,
+LogFrame::LogFrame(const std::map<std::string, detail::Any> &_environment,
+const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>> &_agents,
 const unsigned int _step_count)
     : environment(_environment)
     , agents(_agents)
@@ -29,7 +29,7 @@ AgentLogFrame LogFrame::getAgent(const std::string &agent_name, const std::strin
     return AgentLogFrame(it->second.first, it->second.second);
 }
 
-AgentLogFrame::AgentLogFrame(const std::map<LoggingConfig::NameReductionFn, util::Any> &_data, const unsigned int _count)
+AgentLogFrame::AgentLogFrame(const std::map<LoggingConfig::NameReductionFn, detail::Any> &_data, const unsigned int _count)
     : data(_data)
     , count(_count) { }
 
@@ -64,8 +64,8 @@ StepLogFrame::StepLogFrame()
     , step_time(0.0) { }
 
 
-StepLogFrame::StepLogFrame(const std::map<std::string, util::Any>&& _environment,
-    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>>&& _agents,
+StepLogFrame::StepLogFrame(const std::map<std::string, detail::Any>&& _environment,
+    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>>&& _agents,
     const unsigned int _step_count)
     : LogFrame(_environment, _agents, _step_count)
     , step_time(0.0) { }
@@ -78,8 +78,8 @@ ExitLogFrame::ExitLogFrame()
     , total_time(0.0) { }
 
 
-ExitLogFrame::ExitLogFrame(const std::map<std::string, util::Any>&& _environment,
-    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, util::Any>, unsigned int>>&& _agents,
+ExitLogFrame::ExitLogFrame(const std::map<std::string, detail::Any>&& _environment,
+    const std::map<util::StringPair, std::pair<std::map<LoggingConfig::NameReductionFn, detail::Any>, unsigned int>>&& _agents,
     const unsigned int _step_count)
     : LogFrame(_environment, _agents, _step_count)
     , rtc_time(0.0)
diff --git a/src/flamegpu/sim/LoggingConfig.cu b/src/flamegpu/simulation/LoggingConfig.cu
similarity index 96%
rename from src/flamegpu/sim/LoggingConfig.cu
rename to src/flamegpu/simulation/LoggingConfig.cu
index ab365a436..0b26ff5a5 100644
--- a/src/flamegpu/sim/LoggingConfig.cu
+++ b/src/flamegpu/simulation/LoggingConfig.cu
@@ -1,6 +1,6 @@
-#include "flamegpu/sim/LoggingConfig.h"
+#include "flamegpu/simulation/LoggingConfig.h"
 
-#include "flamegpu/sim/AgentLoggingConfig.h"
+#include "flamegpu/simulation/AgentLoggingConfig.h"
 #include "flamegpu/model/ModelDescription.h"
 #include "flamegpu/model/ModelData.h"
 #include "flamegpu/model/AgentData.h"
diff --git a/src/flamegpu/sim/RunPlan.cpp b/src/flamegpu/simulation/RunPlan.cpp
similarity index 95%
rename from src/flamegpu/sim/RunPlan.cpp
rename to src/flamegpu/simulation/RunPlan.cpp
index 5c08143f0..c0af8ef17 100644
--- a/src/flamegpu/sim/RunPlan.cpp
+++ b/src/flamegpu/simulation/RunPlan.cpp
@@ -1,5 +1,5 @@
-#include "flamegpu/sim/RunPlan.h"
-#include "flamegpu/sim/RunPlanVector.h"
+#include "flamegpu/simulation/RunPlan.h"
+#include "flamegpu/simulation/RunPlanVector.h"
 
 #include "flamegpu/model/ModelDescription.h"
 
@@ -22,7 +22,7 @@ RunPlan& RunPlan::operator=(const RunPlan& other) {
     this->output_subdirectory = other.output_subdirectory;
     this->allow_0_steps = other.allow_0_steps;
     for (auto &i : other.property_overrides)
-        this->property_overrides.emplace(i.first, util::Any(i.second));
+        this->property_overrides.emplace(i.first, detail::Any(i.second));
     return *this;
 }
 void RunPlan::setRandomSimulationSeed(const uint64_t _random_seed) {
diff --git a/src/flamegpu/sim/RunPlanVector.cpp b/src/flamegpu/simulation/RunPlanVector.cpp
similarity index 99%
rename from src/flamegpu/sim/RunPlanVector.cpp
rename to src/flamegpu/simulation/RunPlanVector.cpp
index be82b0fc1..522a90885 100644
--- a/src/flamegpu/sim/RunPlanVector.cpp
+++ b/src/flamegpu/simulation/RunPlanVector.cpp
@@ -1,4 +1,4 @@
-#include "flamegpu/sim/RunPlanVector.h"
+#include "flamegpu/simulation/RunPlanVector.h"
 #include "flamegpu/model/ModelDescription.h"
 
 namespace flamegpu {
diff --git a/src/flamegpu/sim/Simulation.cu b/src/flamegpu/simulation/Simulation.cu
similarity index 99%
rename from src/flamegpu/sim/Simulation.cu
rename to src/flamegpu/simulation/Simulation.cu
index 4da2bf6e6..94f5f63a6 100644
--- a/src/flamegpu/sim/Simulation.cu
+++ b/src/flamegpu/simulation/Simulation.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/sim/Simulation.h"
+#include "flamegpu/simulation/Simulation.h"
 
 #include <algorithm>
 #include <atomic>
@@ -11,8 +11,8 @@
 #include "flamegpu/io/StateReaderFactory.h"
 #include "flamegpu/io/StateWriterFactory.h"
 #include "flamegpu/io/LoggerFactory.h"
-#include "flamegpu/runtime/utility/RandomManager.cuh"
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/detail/RandomManager.cuh"
+#include "flamegpu/simulation/AgentVector.h"
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/util/nvtx.h"
 #include "flamegpu/model/EnvironmentData.h"
diff --git a/src/flamegpu/gpu/CUDAAgent.cu b/src/flamegpu/simulation/detail/CUDAAgent.cu
similarity index 94%
rename from src/flamegpu/gpu/CUDAAgent.cu
rename to src/flamegpu/simulation/detail/CUDAAgent.cu
index 3511125ac..e388d0d3e 100644
--- a/src/flamegpu/gpu/CUDAAgent.cu
+++ b/src/flamegpu/simulation/detail/CUDAAgent.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/gpu/CUDAAgent.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
@@ -17,22 +17,23 @@
 #endif
 
 #include "flamegpu/version.h"
-#include "flamegpu/gpu/CUDAFatAgent.h"
-#include "flamegpu/gpu/CUDAAgentStateList.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/detail/CUDAFatAgent.h"
+#include "flamegpu/simulation/detail/CUDAAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/CUDASimulation.h"
 
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/model/AgentFunctionDescription.h"
 #include "flamegpu/runtime/detail/curve/HostCurve.cuh"
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
-#include "flamegpu/gpu/CUDAScatter.cuh"
-#include "flamegpu/util/detail/compute_capability.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/pop/DeviceAgentVector_impl.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 CUDAAgent::CUDAAgent(const AgentData& description, const CUDASimulation &_cudaSimulation)
     : agent_description(description)  // This is a master agent, so it must create a new fat_agent
@@ -205,7 +206,7 @@ void CUDAAgent::validateIDCollisions(cudaStream_t stream) const {
     size_t temp_storage_bytes2 = 0;
     gpuErrchk(cub::DeviceReduce::Sum(nullptr, temp_storage_bytes2, d_keysIn, d_keysOut, agentCount - 1, stream));
     if (temp_storage_bytes2 > temp_storage_bytes) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_temp_storage));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_temp_storage));
         temp_storage_bytes = temp_storage_bytes2;
         gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
     }
@@ -213,9 +214,9 @@ void CUDAAgent::validateIDCollisions(cudaStream_t stream) const {
     id_t flagsSet = 0;
     gpuErrchk(cudaMemcpyAsync(&flagsSet, d_keysOut, sizeof(id_t), cudaMemcpyDeviceToHost, stream));
     // Cleanup
-    gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_temp_storage));
-    gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keysIn));
-    gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_keysOut));
+    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_temp_storage));
+    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keysIn));
+    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keysOut));
     if (flagsSet) {
         THROW exception::AgentIDCollision("%u agents of type '%s' share an ID with another agent of the same type, "
             "you may need to explicitly reset agent IDs for 1 or more populations before adding them to the CUDASimulation, "
@@ -289,25 +290,25 @@ void *CUDAAgent::getStateVariablePtr(const std::string &state_name, const std::s
     }
     return sm->second->getVariablePointer(variable_name);
 }
-void CUDAAgent::processDeath(const AgentFunctionData& func, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::processDeath(const AgentFunctionData& func, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Optionally process agent death
     if (func.has_agent_death) {
         // Agent death operates on all mapped vars, so handled by fat agent
         fat_agent->processDeath(fat_index, func.initial_state, scatter, streamId, stream);
     }
 }
-void CUDAAgent::transitionState(const std::string &_src, const std::string &_dest, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::transitionState(const std::string &_src, const std::string &_dest, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // All mapped vars need to transition too, so handled by fat agent
     fat_agent->transitionState(fat_index, _src, _dest, scatter, streamId, stream);
 }
-void CUDAAgent::processFunctionCondition(const AgentFunctionData& func, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::processFunctionCondition(const AgentFunctionData& func, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Optionally process function condition
     if ((func.condition) || (!func.rtc_func_condition_name.empty())) {
         // Agent function condition operates on all mapped vars, so handled by fat agent
         fat_agent->processFunctionCondition(fat_index, func.initial_state, scatter, streamId, stream);
     }
 }
-void CUDAAgent::scatterHostCreation(const std::string &state_name, const unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::scatterHostCreation(const std::string &state_name, const unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     auto sm = state_map.find(state_name);
     if (sm == state_map.end()) {
         THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
@@ -316,7 +317,7 @@ void CUDAAgent::scatterHostCreation(const std::string &state_name, const unsigne
     }
     sm->second->scatterHostCreation(newSize, d_inBuff, offsets, scatter, streamId, stream);
 }
-void CUDAAgent::scatterSort_async(const std::string &state_name, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void CUDAAgent::scatterSort_async(const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     auto sm = state_map.find(state_name);
     if (sm == state_map.end()) {
         THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
@@ -325,7 +326,7 @@ void CUDAAgent::scatterSort_async(const std::string &state_name, CUDAScatter &sc
     }
     sm->second->scatterSort_async(scatter, streamId, stream);
 }
-void CUDAAgent::mapNewRuntimeVariables_async(const CUDAAgent& func_agent, const AgentFunctionData& func, unsigned int maxLen, CUDAScatter &scatter, unsigned int instance_id, cudaStream_t stream, unsigned int streamId) {
+void CUDAAgent::mapNewRuntimeVariables_async(const CUDAAgent& func_agent, const AgentFunctionData& func, unsigned int maxLen, detail::CUDAScatter &scatter, unsigned int instance_id, cudaStream_t stream, unsigned int streamId) {
     // Confirm agent output is set
     if (auto oa = func.agent_output.lock()) {
         // check the cuda agent state map to find the correct state list for functions starting state
@@ -408,7 +409,7 @@ void CUDAAgent::releaseNewBuffer(const AgentFunctionData& func) {
     }
 }
 
-void CUDAAgent::scatterNew(const AgentFunctionData& func, const unsigned int newSize, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::scatterNew(const AgentFunctionData& func, const unsigned int newSize, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Confirm agent output is set
     if (auto oa = func.agent_output.lock()) {
         auto sm = state_map.find(func.agent_output_state);
@@ -530,7 +531,7 @@ void CUDAAgent::addInstantitateRTCFunction(const AgentFunctionData& func, const
         agent_function_file.close();
 #endif
 
-    util::detail::JitifyCache &jitify = util::detail::JitifyCache::getInstance();
+    detail::JitifyCache &jitify = detail::JitifyCache::getInstance();
     // switch between normal agent function and agent function condition
     if (!function_condition) {
         const std::string t_func_impl = std::string(func.rtc_func_name).append("_impl");
@@ -631,7 +632,7 @@ const CUDAAgent::CUDARTCFuncMap& CUDAAgent::getRTCFunctions() const {
     return rtc_func_map;
 }
 
-void CUDAAgent::initUnmappedVars(CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgent::initUnmappedVars(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     for (auto &s : state_map) {
         s.second->initUnmappedVars(scatter, streamId, stream);
     }
@@ -680,7 +681,7 @@ id_t CUDAAgent::nextID(unsigned int count) {
 id_t* CUDAAgent::getDeviceNextID() {
     return fat_agent->getDeviceNextID();
 }
-void CUDAAgent::assignIDs(HostAPI& hostapi, CUDAScatter &scatter, cudaStream_t stream, const unsigned int streamId) {
+void CUDAAgent::assignIDs(HostAPI& hostapi, detail::CUDAScatter &scatter, cudaStream_t stream, const unsigned int streamId) {
     fat_agent->assignIDs(hostapi, scatter, stream, streamId);
 }
 
@@ -703,4 +704,5 @@ void CUDAAgent::resetPopulationVecs() {
     population_dvec.clear();
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAAgentStateList.cu b/src/flamegpu/simulation/detail/CUDAAgentStateList.cu
similarity index 93%
rename from src/flamegpu/gpu/CUDAAgentStateList.cu
rename to src/flamegpu/simulation/detail/CUDAAgentStateList.cu
index 16cee3ddc..7fb82b710 100644
--- a/src/flamegpu/gpu/CUDAAgentStateList.cu
+++ b/src/flamegpu/simulation/detail/CUDAAgentStateList.cu
@@ -1,14 +1,14 @@
-#include "flamegpu/gpu/CUDAAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAAgentStateList.h"
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 
-#include "flamegpu/gpu/CUDAAgent.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/pop/AgentVector.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/AgentVector.h"
 #include "flamegpu/model/AgentDescription.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
-#include "flamegpu/runtime/HostNewAgentAPI.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
+#include "flamegpu/runtime/agent/HostNewAgentAPI.h"
 #include "flamegpu/exception/FLAMEGPUException.h"
 
 #ifdef _MSC_VER
@@ -21,6 +21,7 @@
 #endif  // _MSC_VER
 
 namespace flamegpu {
+namespace detail {
 
 CUDAAgentStateList::CUDAAgentStateList(
     const std::shared_ptr<CUDAFatAgentStateList> &fat_list,
@@ -138,7 +139,7 @@ void CUDAAgentStateList::getAgentData(AgentVector& population) const {
     }
     population._size = data_count;  // Private AgentVector::resize() does not update size
 }
-void CUDAAgentStateList::scatterHostCreation(unsigned int newSize, char* const d_inBuff, const VarOffsetStruct & offsets, CUDAScatter & scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgentStateList::scatterHostCreation(unsigned int newSize, char* const d_inBuff, const VarOffsetStruct & offsets, detail::CUDAScatter & scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Resize agent list if required
     parent_list->resize(parent_list->getSizeWithDisabled() + newSize, true, stream);
     // Build scatter data
@@ -166,10 +167,10 @@ void CUDAAgentStateList::scatterHostCreation(unsigned int newSize, char* const d
     // Update number of alive agents
     parent_list->setAgentCount(parent_list->getSize() + newSize);
 }
-void CUDAAgentStateList::scatterSort_async(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void CUDAAgentStateList::scatterSort_async(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     parent_list->scatterSort_async(scatter, streamId, stream);
 }
-unsigned int CUDAAgentStateList::scatterNew(void * d_newBuff, const unsigned int newSize, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+unsigned int CUDAAgentStateList::scatterNew(void * d_newBuff, const unsigned int newSize, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     if (newSize) {
         CUDAScanCompactionConfig &scanCfg = scatter.Scan().Config(CUDAScanCompaction::Type::AGENT_OUTPUT, streamId);
         // Check if we need to resize cub storage
@@ -234,7 +235,7 @@ unsigned int CUDAAgentStateList::scatterNew(void * d_newBuff, const unsigned int
 bool CUDAAgentStateList::getIsSubStatelist() {
     return isSubStateList;
 }
-void CUDAAgentStateList::initUnmappedVars(CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAAgentStateList::initUnmappedVars(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     assert(parent_list->getSizeWithDisabled() == parent_list->getSize());
     if (parent_list->getSize()) {
         assert(isSubStateList);
@@ -263,4 +264,5 @@ std::list<std::shared_ptr<VariableBuffer>> CUDAAgentStateList::getUnboundVariabl
     return parent_list->getBuffers(exclusionSet);
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAFatAgent.cu b/src/flamegpu/simulation/detail/CUDAFatAgent.cu
similarity index 94%
rename from src/flamegpu/gpu/CUDAFatAgent.cu
rename to src/flamegpu/simulation/detail/CUDAFatAgent.cu
index 59fb16dc4..af440792f 100644
--- a/src/flamegpu/gpu/CUDAFatAgent.cu
+++ b/src/flamegpu/simulation/detail/CUDAFatAgent.cu
@@ -1,9 +1,9 @@
-#include "flamegpu/gpu/CUDAFatAgent.h"
+#include "flamegpu/simulation/detail/CUDAFatAgent.h"
 
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 #include "flamegpu/runtime/HostAPI.h"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 #ifdef _MSC_VER
 #pragma warning(push, 1)
@@ -15,6 +15,7 @@
 #endif
 
 namespace flamegpu {
+namespace detail {
 
 CUDAFatAgent::CUDAFatAgent(const AgentData& description)
     : mappedAgentCount(0)
@@ -33,10 +34,10 @@ CUDAFatAgent::CUDAFatAgent(const AgentData& description)
 }
 CUDAFatAgent::~CUDAFatAgent() {
     if (d_nextID) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_nextID));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_nextID));
     }
     for (auto &b : d_newLists) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(b.data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(b.data));
     }
     d_newLists.clear();
 }
@@ -85,7 +86,7 @@ void CUDAFatAgent::addSubAgent(
     mappedAgentCount++;
 }
 
-void CUDAFatAgent::processDeath(const unsigned int agent_fat_id, const std::string &state_name, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAFatAgent::processDeath(const unsigned int agent_fat_id, const std::string &state_name, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     auto sm = states.find({agent_fat_id, state_name});
     if (sm == states.end()) {
         THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
@@ -119,7 +120,7 @@ void CUDAFatAgent::processDeath(const unsigned int agent_fat_id, const std::stri
     sm->second->scatterDeath(scatter, streamId, stream);
 }
 
-void CUDAFatAgent::transitionState(unsigned int agent_fat_id, const std::string &_src, const std::string &_dest, CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void CUDAFatAgent::transitionState(unsigned int agent_fat_id, const std::string &_src, const std::string &_dest, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     // Optionally process state transition
     if (_src != _dest) {
         auto src = states.find({agent_fat_id, _src});
@@ -166,7 +167,7 @@ void CUDAFatAgent::transitionState(unsigned int agent_fat_id, const std::string
     }
 }
 
-void CUDAFatAgent::processFunctionCondition(const unsigned int agent_fat_id, const std::string &state_name, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAFatAgent::processFunctionCondition(const unsigned int agent_fat_id, const std::string &state_name, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     auto sm = states.find({agent_fat_id, state_name});
     if (sm == states.end()) {
         THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
@@ -256,7 +257,7 @@ void *CUDAFatAgent::allocNewBuffer(const size_t total_agent_size, const unsigned
             NewBuffer my_b = b;
             // Erase and resize/reinsert to d_newLists to mark as in use
             d_newLists.erase(b);
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(my_b.data));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(my_b.data));
             gpuErrchk(cudaMalloc(&my_b.data, ALLOCATION_SIZE));
             my_b.size = ALLOCATION_SIZE;
             my_b.in_use = true;
@@ -327,7 +328,7 @@ void CUDAFatAgent::notifyDeviceBirths(unsigned int newCount) {
     assert(t == _nextID);  // At the end of device birth they should be equal, as no host birth can occur between pre and post processing agent fn
 #endif
 }
-void CUDAFatAgent::assignIDs(HostAPI& hostapi, CUDAScatter &scatter, cudaStream_t stream, const unsigned int streamId) {
+void CUDAFatAgent::assignIDs(HostAPI& hostapi, detail::CUDAScatter &scatter, cudaStream_t stream, const unsigned int streamId) {
     flamegpu::util::nvtx::Range range{"CUDAFatAgent::assignIDs"};
     if (agent_ids_have_init) return;
     id_t h_max = ID_NOT_SET;
@@ -380,4 +381,5 @@ void CUDAFatAgent::resetIDCounter() {
     _nextID = ID_NOT_SET + 1;
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAFatAgentStateList.cu b/src/flamegpu/simulation/detail/CUDAFatAgentStateList.cu
similarity index 91%
rename from src/flamegpu/gpu/CUDAFatAgentStateList.cu
rename to src/flamegpu/simulation/detail/CUDAFatAgentStateList.cu
index 4ae7f3502..6cf2bd64e 100644
--- a/src/flamegpu/gpu/CUDAFatAgentStateList.cu
+++ b/src/flamegpu/simulation/detail/CUDAFatAgentStateList.cu
@@ -1,8 +1,9 @@
-#include "flamegpu/gpu/CUDAFatAgentStateList.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAFatAgentStateList.h"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 CUDAFatAgentStateList::CUDAFatAgentStateList(const AgentData& description)
     : aliveAgents(0)
@@ -39,8 +40,8 @@ CUDAFatAgentStateList::CUDAFatAgentStateList(const CUDAFatAgentStateList& other)
 }
 CUDAFatAgentStateList::~CUDAFatAgentStateList() {
     for (const auto &buff : variables_unique) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(buff->data));
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(buff->data_swap));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(buff->data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(buff->data_swap));
     }
 }
 void CUDAFatAgentStateList::addSubAgentVariables(
@@ -81,7 +82,7 @@ void CUDAFatAgentStateList::resize(const unsigned int minSize, const bool retain
         const size_t var_size = buff->type_size * buff->elements;
         const size_t buff_size = var_size * newSize;
         // Free old swap buffer
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(buff->data_swap));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(buff->data_swap));
         // Allocate new buffer to swap
         gpuErrchk(cudaMalloc(&buff->data_swap, buff_size));
         // Copy old data to new buffer in swap
@@ -107,7 +108,7 @@ void CUDAFatAgentStateList::resize(const unsigned int minSize, const bool retain
         // Swap buffers
         std::swap(buff->data_swap, buff->data);
         // Free old swap buffer
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(buff->data_swap));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(buff->data_swap));
         // Allocate new buffer to swap
         gpuErrchk(cudaMalloc(&buff->data_swap, buff_size));
         // Update condition list
@@ -143,7 +144,7 @@ void CUDAFatAgentStateList::setAgentCount(const unsigned int newCount, const boo
     }
     aliveAgents = disabledAgents + newCount;
 }
-unsigned int CUDAFatAgentStateList::scatterDeath(CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+unsigned int CUDAFatAgentStateList::scatterDeath(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Build scatter data
     std::vector<CUDAScatter::ScatterData> sd;
     for (const auto &v : variables_unique) {
@@ -167,7 +168,7 @@ unsigned int CUDAFatAgentStateList::scatterDeath(CUDAScatter &scatter, const uns
 
     return living_agents;
 }
-unsigned int CUDAFatAgentStateList::scatterAgentFunctionConditionFalse(CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+unsigned int CUDAFatAgentStateList::scatterAgentFunctionConditionFalse(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // This makes no sense if we have disabled agents (it's supposed to reorder to create disabled agents)
     assert(disabledAgents == 0);
     // Build scatter data
@@ -184,7 +185,7 @@ unsigned int CUDAFatAgentStateList::scatterAgentFunctionConditionFalse(CUDAScatt
         aliveAgents, 0, false, disabledAgents);
     return scattered_agents;
 }
-unsigned int CUDAFatAgentStateList::scatterAgentFunctionConditionTrue(const unsigned int conditionFailCount, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+unsigned int CUDAFatAgentStateList::scatterAgentFunctionConditionTrue(const unsigned int conditionFailCount, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // This makes no sense if we have disabled agents (it's suppose to reorder to create disabled agents)
     assert(disabledAgents == 0);
     // Build scatter data
@@ -216,7 +217,7 @@ void CUDAFatAgentStateList::setDisabledAgents(const unsigned int numberOfDisable
         v->data_condition = data_p + (numberOfDisabled * v->type_size * v->elements);
     }
 }
-void CUDAFatAgentStateList::scatterSort_async(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void CUDAFatAgentStateList::scatterSort_async(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     // This is not designed to run when there are disabled agents
     assert(disabledAgents == 0);
     // Build scatter data
@@ -232,7 +233,7 @@ void CUDAFatAgentStateList::scatterSort_async(CUDAScatter &scatter, unsigned int
     }
     scatter.scatterPosition_async(streamId, stream, CUDAScatter::Type::MESSAGE_OUTPUT, sd, aliveAgents);
 }
-void CUDAFatAgentStateList::initVariables(std::set<std::shared_ptr<VariableBuffer>> &exclusionSet, const unsigned int initCount, const unsigned initOffset, CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAFatAgentStateList::initVariables(std::set<std::shared_ptr<VariableBuffer>> &exclusionSet, const unsigned int initCount, const unsigned initOffset, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     if (initCount && exclusionSet.size()) {
         assert(initCount + initOffset <= bufferLen);
         std::list<std::shared_ptr<VariableBuffer>> initVars;
@@ -267,4 +268,5 @@ std::list<std::shared_ptr<VariableBuffer>> CUDAFatAgentStateList::getBuffers(std
     return returnVars;
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAMacroEnvironment.cu b/src/flamegpu/simulation/detail/CUDAMacroEnvironment.cu
similarity index 96%
rename from src/flamegpu/gpu/CUDAMacroEnvironment.cu
rename to src/flamegpu/simulation/detail/CUDAMacroEnvironment.cu
index 8da72f230..75eebf5fd 100644
--- a/src/flamegpu/gpu/CUDAMacroEnvironment.cu
+++ b/src/flamegpu/simulation/detail/CUDAMacroEnvironment.cu
@@ -1,13 +1,14 @@
-#include "flamegpu/gpu/CUDAMacroEnvironment.h"
+#include "flamegpu/simulation/detail/CUDAMacroEnvironment.h"
 
 #include "flamegpu/model/EnvironmentDescription.h"
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/model/AgentFunctionData.cuh"
 #include "flamegpu/model/SubEnvironmentData.h"
 #include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 CUDAMacroEnvironment::CUDAMacroEnvironment(const EnvironmentData& description, const CUDASimulation& _cudaSimulation)
     : cudaSimulation(_cudaSimulation) {
@@ -74,7 +75,7 @@ void CUDAMacroEnvironment::free() {
     for (auto& prop : properties) {
         if (prop.second.d_ptr) {
             if (!prop.second.is_sub) {
-                gpuErrchk(flamegpu::util::detail::cuda::cudaFree(prop.second.d_ptr));
+                gpuErrchk(flamegpu::detail::cuda::cudaFree(prop.second.d_ptr));
             }
             prop.second.d_ptr = nullptr;
         }
@@ -163,4 +164,5 @@ unsigned int CUDAMacroEnvironment::getDeviceRWFlags(const std::string& property_
     return ret;
 }
 #endif
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAMessage.cu b/src/flamegpu/simulation/detail/CUDAMessage.cu
similarity index 91%
rename from src/flamegpu/gpu/CUDAMessage.cu
rename to src/flamegpu/simulation/detail/CUDAMessage.cu
index c51300236..8e7a89a7d 100644
--- a/src/flamegpu/gpu/CUDAMessage.cu
+++ b/src/flamegpu/simulation/detail/CUDAMessage.cu
@@ -1,11 +1,11 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/CUDAAgent.h"
-#include "flamegpu/gpu/CUDAMessageList.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
+#include "flamegpu/simulation/detail/CUDAMessageList.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 
 #include "flamegpu/runtime/messaging/MessageBruteForce.h"
 #include "flamegpu/model/AgentFunctionDescription.h"
@@ -23,6 +23,7 @@
 #endif
 
 namespace flamegpu {
+namespace detail {
 
 CUDAMessage::CUDAMessage(const MessageBruteForce::Data& description, const CUDASimulation& cudaSimulation)
     : message_description(description)
@@ -44,7 +45,7 @@ const MessageBruteForce::Data& CUDAMessage::getMessageData() const {
     return message_description;
 }
 
-void CUDAMessage::resize(unsigned int newSize, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, unsigned int keepLen) {
+void CUDAMessage::resize(unsigned int newSize, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, unsigned int keepLen) {
     // Only grow currently
     if (newSize > max_list_size) {
         const unsigned int _keep_len = std::min(max_list_size, keepLen);
@@ -75,7 +76,7 @@ void CUDAMessage::setMessageCount(const unsigned int _message_count) {
     }
     message_count = _message_count;
 }
-void CUDAMessage::init(CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
+void CUDAMessage::init(detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
     specialisation_handler->init(scatter, streamId, stream);
 }
 void CUDAMessage::zeroAllMessageData(cudaStream_t stream) {
@@ -151,7 +152,7 @@ void CUDAMessage::mapWriteRuntimeVariables(const AgentFunctionData& func, const
     specialisation_handler->allocateMetaDataDevicePtr(stream);
 }
 
-void CUDAMessage::swap(bool isOptional, unsigned int newMessageCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId) {
+void CUDAMessage::swap(bool isOptional, unsigned int newMessageCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId) {
     if (!message_list) {
         THROW exception::InvalidMessageData("MessageList '%s' is not yet allocated, in CUDAMessage::swap()\n", message_description.name.c_str());
     }
@@ -196,7 +197,7 @@ void CUDAMessage::swap() {
     message_list->swap();
 }
 
-void CUDAMessage::buildIndex(CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
+void CUDAMessage::buildIndex(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
     // Build the index if required.
     if (pbm_construction_required) {
         specialisation_handler->buildIndex(scatter, streamId, stream);
@@ -207,4 +208,5 @@ const void *CUDAMessage::getMetaDataDevicePtr() const {
     return specialisation_handler->getMetaDataDevicePtr();
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAMessageList.cu b/src/flamegpu/simulation/detail/CUDAMessageList.cu
similarity index 89%
rename from src/flamegpu/gpu/CUDAMessageList.cu
rename to src/flamegpu/simulation/detail/CUDAMessageList.cu
index 5e57242e2..367ad022d 100644
--- a/src/flamegpu/gpu/CUDAMessageList.cu
+++ b/src/flamegpu/simulation/detail/CUDAMessageList.cu
@@ -1,21 +1,22 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 
-#include "flamegpu/gpu/CUDAMessageList.h"
+#include "flamegpu/simulation/detail/CUDAMessageList.h"
 
-#include "flamegpu/gpu/CUDAMessage.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAMessage.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/runtime/messaging/MessageBruteForce/MessageBruteForceHost.h"
-#include "flamegpu/gpu/CUDAScatter.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 /**
 * CUDAMessageList class
 * @brief populates CUDA message map
 */
-CUDAMessageList::CUDAMessageList(CUDAMessage& cuda_message, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId)
+CUDAMessageList::CUDAMessageList(CUDAMessage& cuda_message, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId)
     : message(cuda_message) {
     // allocate message lists
     allocateDeviceMessageList(d_list);
@@ -97,7 +98,7 @@ void CUDAMessageList::releaseDeviceMessageList(CUDAMessageMap& memory_map) {
     // for each device pointer in the cuda memory map we need to free these
     for (const auto &mm : memory_map) {
         // free the memory on the device
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(mm.second));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(mm.second));
     }
     memory_map.clear();
 }
@@ -148,7 +149,7 @@ void CUDAMessageList::swap() {
     std::swap(d_list, d_swap_list);
 }
 
-unsigned int CUDAMessageList::scatter(unsigned int newCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, bool append) {
+unsigned int CUDAMessageList::scatter(unsigned int newCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId, bool append) {
     if (append) {
         unsigned int oldCount = message.getMessageCount();
         return oldCount + scatter.scatter(streamId,
@@ -168,7 +169,7 @@ unsigned int CUDAMessageList::scatter(unsigned int newCount, CUDAScatter &scatte
             0);
     }
 }
-unsigned int CUDAMessageList::scatterAll(unsigned int newCount, CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId) {
+unsigned int CUDAMessageList::scatterAll(unsigned int newCount, detail::CUDAScatter &scatter, cudaStream_t stream, unsigned int streamId) {
     unsigned int oldCount = message.getMessageCount();
     return oldCount + scatter.scatterAll(streamId,
         stream,
@@ -178,4 +179,5 @@ unsigned int CUDAMessageList::scatterAll(unsigned int newCount, CUDAScatter &sca
         oldCount);
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAScanCompaction.cu b/src/flamegpu/simulation/detail/CUDAScanCompaction.cu
similarity index 83%
rename from src/flamegpu/gpu/CUDAScanCompaction.cu
rename to src/flamegpu/simulation/detail/CUDAScanCompaction.cu
index 66896acf1..5cc66aa3f 100644
--- a/src/flamegpu/gpu/CUDAScanCompaction.cu
+++ b/src/flamegpu/simulation/detail/CUDAScanCompaction.cu
@@ -1,11 +1,12 @@
 #include <cassert>
 
-#include "flamegpu/gpu/CUDAScanCompaction.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAScanCompaction.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 /**
  * CUDAScanCompaction methods
@@ -36,11 +37,11 @@ CUDAScanCompactionConfig::~CUDAScanCompactionConfig() {
 }
 void CUDAScanCompactionConfig::free_scan_flag() {
     if (d_ptrs.scan_flag) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_ptrs.scan_flag));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_ptrs.scan_flag));
         d_ptrs.scan_flag = nullptr;
     }
     if (d_ptrs.position) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_ptrs.position));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_ptrs.position));
         d_ptrs.position = nullptr;
     }
 }
@@ -63,4 +64,5 @@ void CUDAScanCompactionConfig::resize_scan_flag(const unsigned int count) {
     }
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/CUDAScatter.cu b/src/flamegpu/simulation/detail/CUDAScatter.cu
similarity index 98%
rename from src/flamegpu/gpu/CUDAScatter.cu
rename to src/flamegpu/simulation/detail/CUDAScatter.cu
index b27166ec9..735b878c4 100644
--- a/src/flamegpu/gpu/CUDAScatter.cu
+++ b/src/flamegpu/simulation/detail/CUDAScatter.cu
@@ -1,12 +1,12 @@
-#include "flamegpu/gpu/CUDAScatter.cuh"
+#include "flamegpu/simulation/detail/CUDAScatter.cuh"
 
 #include <cuda_runtime.h>
 #include <vector>
 #include <cassert>
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/gpu/CUDAFatAgentStateList.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAFatAgentStateList.h"
+#include "flamegpu/detail/cuda.cuh"
 
 #ifdef _MSC_VER
 #pragma warning(push, 1)
@@ -18,6 +18,7 @@
 #endif
 
 namespace flamegpu {
+namespace detail {
 
 // @todo - Make _async variants of functions which launch kernels. This can be called by the non async version and immediately sync.
 
@@ -32,7 +33,7 @@ CUDAScatter::StreamData::~StreamData() {
      As this is only ever destroyed at exit time, it's not a real memory leak either.
     */
     if (d_data) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_data));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_data));
     }
     d_data = nullptr;
     data_len = 0;
@@ -40,7 +41,7 @@ CUDAScatter::StreamData::~StreamData() {
 void CUDAScatter::StreamData::resize(const unsigned int newLen) {
     if (newLen > data_len) {
         if (d_data) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_data));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_data));
         }
         gpuErrchk(cudaMalloc(&d_data, newLen * sizeof(ScatterData)));
         data_len = newLen;
@@ -629,4 +630,5 @@ void CUDAScatter::arrayMessageReorder(
 #endif
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/gpu/detail/CubTemporaryMemory.cu b/src/flamegpu/simulation/detail/CubTemporaryMemory.cu
similarity index 71%
rename from src/flamegpu/gpu/detail/CubTemporaryMemory.cu
rename to src/flamegpu/simulation/detail/CubTemporaryMemory.cu
index 524e6f25a..12eb56325 100644
--- a/src/flamegpu/gpu/detail/CubTemporaryMemory.cu
+++ b/src/flamegpu/simulation/detail/CubTemporaryMemory.cu
@@ -1,11 +1,11 @@
-#include "flamegpu/gpu/detail/CubTemporaryMemory.cuh"
+#include "flamegpu/simulation/detail/CubTemporaryMemory.cuh"
 #include <cuda_runtime.h>
 
 #include <cassert>
 
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 namespace detail {
@@ -16,7 +16,7 @@ CubTemporaryMemory::CubTemporaryMemory()
 CubTemporaryMemory::~CubTemporaryMemory() {
     // @todo - cuda is not allowed in destructor
     if (d_cub_temp) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_cub_temp));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_cub_temp));
         d_cub_temp_size = 0;
     }
 }
@@ -24,7 +24,7 @@ void CubTemporaryMemory::resize(const size_t newSize) {
     if (newSize > d_cub_temp_size) {
         flamegpu::util::nvtx::Range range{"CubTemporaryMemory::resizeTempStorage"};
         if (d_cub_temp) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_cub_temp));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_cub_temp));
         }
         gpuErrchk(cudaMalloc(&d_cub_temp, newSize));
         d_cub_temp_size = newSize;
diff --git a/src/flamegpu/runtime/utility/EnvironmentManager.cu b/src/flamegpu/simulation/detail/EnvironmentManager.cu
similarity index 95%
rename from src/flamegpu/runtime/utility/EnvironmentManager.cu
rename to src/flamegpu/simulation/detail/EnvironmentManager.cu
index e2c4a97a6..d4e460e39 100644
--- a/src/flamegpu/runtime/utility/EnvironmentManager.cu
+++ b/src/flamegpu/simulation/detail/EnvironmentManager.cu
@@ -1,15 +1,16 @@
-#include "flamegpu/runtime/utility/EnvironmentManager.cuh"
+#include "flamegpu/simulation/detail/EnvironmentManager.cuh"
 
 #include <iostream>
 
 #include "flamegpu/model/EnvironmentData.h"
 #include "flamegpu/model/SubEnvironmentData.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/exception/FLAMEGPUException.h"
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 void EnvironmentManager::init(const EnvironmentData& desc) {
     if (properties.size()) {
@@ -135,9 +136,9 @@ void EnvironmentManager::setPropertyDirect(const std::string& property_name, con
             "in EnvironmentManager::setProperty().", property_name.c_str());
     }
 }
-util::Any EnvironmentManager::getPropertyAny(const std::string& property_name) const {
+detail::Any EnvironmentManager::getPropertyAny(const std::string& property_name) const {
     const EnvProp &prop = findProperty<void>(property_name, false, 0);
-    return util::Any(h_buffer + prop.offset, prop.length, prop.type, prop.elements);
+    return detail::Any(h_buffer + prop.offset, prop.length, prop.type, prop.elements);
 }
 
 EnvironmentManager::~EnvironmentManager() {
@@ -147,7 +148,7 @@ EnvironmentManager::~EnvironmentManager() {
         h_buffer = nullptr;
     }
     if (d_buffer) {
-        gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_buffer));
+        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_buffer));
         d_buffer = nullptr;
     }
     h_buffer_len = 0;
@@ -170,4 +171,5 @@ void EnvironmentManager::updateDevice_async(const cudaStream_t stream) const {
         d_buffer_ready = true;
     }
 }
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/runtime/utility/RandomManager.cu b/src/flamegpu/simulation/detail/RandomManager.cu
similarity index 78%
rename from src/flamegpu/runtime/utility/RandomManager.cu
rename to src/flamegpu/simulation/detail/RandomManager.cu
index 5ec912699..44e2151c3 100644
--- a/src/flamegpu/runtime/utility/RandomManager.cu
+++ b/src/flamegpu/simulation/detail/RandomManager.cu
@@ -1,4 +1,4 @@
-#include "flamegpu/runtime/utility/RandomManager.cuh"
+#include "flamegpu/simulation/detail/RandomManager.cuh"
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
@@ -9,12 +9,13 @@
 #include <cstdio>
 #include <algorithm>
 
-#include "flamegpu/util/detail/curand.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/curand.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
+namespace detail {
 
 RandomManager::RandomManager() :
     deviceInitialised(false) {
@@ -70,7 +71,7 @@ void RandomManager::freeDevice() {
         length = 0;
         // Release old random states on the deivce and update pointers.
         if (d_random_state) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_random_state));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_random_state));
         }
         d_random_state = nullptr;
     }
@@ -82,7 +83,7 @@ void RandomManager::free() {
     freeDevice();
 }
 
-util::detail::curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
+detail::curandState *RandomManager::resize(size_type _length, cudaStream_t stream) {
     assert(growthModifier > 1.0);
     assert(shrinkModifier > 0.0);
     assert(shrinkModifier <= 1.0);
@@ -105,7 +106,7 @@ util::detail::curandState *RandomManager::resize(size_type _length, cudaStream_t
         resizeDeviceArray(t_length, stream);
     return d_random_state;
 }
-__global__ void init_curand(util::detail::curandState *d_random_state, unsigned int threadCount, uint64_t seed, flamegpu::size_type offset) {
+__global__ void init_curand(detail::curandState *d_random_state, unsigned int threadCount, uint64_t seed, flamegpu::size_type offset) {
     int id = blockIdx.x * blockDim.x + threadIdx.x;
     if (id < threadCount)
         curand_init(seed, offset + id, 0, &d_random_state[offset + id]);
@@ -115,16 +116,16 @@ void RandomManager::resizeDeviceArray(const size_type _length, cudaStream_t stre
     deviceInitialised = true;
     if (_length > h_max_random_size) {
         // Growing array
-        util::detail::curandState *t_hd_random_state = nullptr;
+        detail::curandState *t_hd_random_state = nullptr;
         // Allocate new mem to t_hd
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(detail::curandState)));
         // Copy hd->t_hd[****    ]
         if (d_random_state) {
-            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
+            gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, length * sizeof(detail::curandState), cudaMemcpyDeviceToDevice, stream));
         }
         // Update pointers hd=t_hd
         if (d_random_state) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_random_state));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_random_state));
         }
         d_random_state = t_hd_random_state;
         // Init new[    ****]
@@ -132,7 +133,7 @@ void RandomManager::resizeDeviceArray(const size_type _length, cudaStream_t stre
             // We have part/all host backup, copy to device array
             // Reinit backup[    **  ]
             const size_type copy_len = std::min(h_max_random_size, _length);
-            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(util::detail::curandState), cudaMemcpyHostToDevice, stream));  // Host not pinned
+            gpuErrchk(cudaMemcpyAsync(d_random_state + length, h_max_random_state + length, copy_len * sizeof(detail::curandState), cudaMemcpyHostToDevice, stream));  // Host not pinned
             length += copy_len;
         }
         if (_length > length) {
@@ -144,20 +145,20 @@ void RandomManager::resizeDeviceArray(const size_type _length, cudaStream_t stre
         }
     } else {
         // Shrinking array
-        util::detail::curandState *t_hd_random_state = nullptr;
-        util::detail::curandState *t_h_max_random_state = nullptr;
+        detail::curandState *t_hd_random_state = nullptr;
+        detail::curandState *t_h_max_random_state = nullptr;
         // Allocate new
-        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(util::detail::curandState)));
+        gpuErrchk(cudaMalloc(&t_hd_random_state, _length * sizeof(detail::curandState)));
         // Allocate host backup
         if (length > h_max_random_size)
-            t_h_max_random_state = reinterpret_cast<util::detail::curandState*>(malloc(length * sizeof(util::detail::curandState)));
+            t_h_max_random_state = reinterpret_cast<detail::curandState*>(malloc(length * sizeof(detail::curandState)));
         else
             t_h_max_random_state = h_max_random_state;
         // Copy old->new
         assert(d_random_state);
-        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(util::detail::curandState), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(t_hd_random_state, d_random_state, _length * sizeof(detail::curandState), cudaMemcpyDeviceToDevice, stream));
         // Copy part being shrunk away to host storage (This could be async with above memcpy?)
-        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(util::detail::curandState), cudaMemcpyDeviceToHost, stream));
+        gpuErrchk(cudaMemcpyAsync(t_h_max_random_state + _length, d_random_state + _length, (length - _length) * sizeof(detail::curandState), cudaMemcpyDeviceToHost, stream));
         // Release and replace old host ptr
         if (length > h_max_random_size) {
             if (h_max_random_state)
@@ -167,7 +168,7 @@ void RandomManager::resizeDeviceArray(const size_type _length, cudaStream_t stre
         }
         // Release old
         if (d_random_state != nullptr) {
-            gpuErrchk(flamegpu::util::detail::cuda::cudaFree(d_random_state));
+            gpuErrchk(flamegpu::detail::cuda::cudaFree(d_random_state));
         }
         // Update pointer
         d_random_state = t_hd_random_state;
@@ -197,8 +198,9 @@ flamegpu::size_type RandomManager::size() {
 uint64_t RandomManager::seed() {
     return mSeed;
 }
-util::detail::curandState *RandomManager::cudaRandomState() {
+detail::curandState *RandomManager::cudaRandomState() {
     return d_random_state;
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/sim/SimLogger.cu b/src/flamegpu/simulation/detail/SimLogger.cu
similarity index 96%
rename from src/flamegpu/sim/SimLogger.cu
rename to src/flamegpu/simulation/detail/SimLogger.cu
index 6a86c1616..099386d07 100644
--- a/src/flamegpu/sim/SimLogger.cu
+++ b/src/flamegpu/simulation/detail/SimLogger.cu
@@ -1,9 +1,9 @@
-#include "flamegpu/sim/SimLogger.h"
+#include "flamegpu/simulation/detail/SimLogger.h"
 
 #include <filesystem>
 
 #include "flamegpu/io/LoggerFactory.h"
-#include "flamegpu/sim/RunPlanVector.h"
+#include "flamegpu/simulation/RunPlanVector.h"
 
 #ifdef _MSC_VER
 #include <windows.h>
@@ -12,6 +12,7 @@
 #endif
 
 namespace flamegpu {
+namespace detail {
 
 SimLogger::SimLogger(const std::vector<RunLog> &_run_logs,
         const RunPlanVector &_run_plans,
@@ -90,4 +91,5 @@ void SimLogger::start() {
     }
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/sim/SimRunner.cu b/src/flamegpu/simulation/detail/SimRunner.cu
similarity index 96%
rename from src/flamegpu/sim/SimRunner.cu
rename to src/flamegpu/simulation/detail/SimRunner.cu
index f5b56ea06..a0846eab9 100644
--- a/src/flamegpu/sim/SimRunner.cu
+++ b/src/flamegpu/simulation/detail/SimRunner.cu
@@ -1,10 +1,10 @@
-#include "flamegpu/sim/SimRunner.h"
+#include "flamegpu/simulation/detail/SimRunner.h"
 
 #include <utility>
 
 #include "flamegpu/model/ModelData.h"
-#include "flamegpu/gpu/CUDASimulation.h"
-#include "flamegpu/sim/RunPlanVector.h"
+#include "flamegpu/simulation/CUDASimulation.h"
+#include "flamegpu/simulation/RunPlanVector.h"
 
 #ifdef _MSC_VER
 #include <windows.h>
@@ -13,6 +13,7 @@
 #endif
 
 namespace flamegpu {
+namespace detail {
 
 SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
     std::atomic<unsigned int> &_err_ct,
@@ -140,4 +141,5 @@ void SimRunner::start() {
     }
 }
 
+}  // namespace detail
 }  // namespace flamegpu
diff --git a/src/flamegpu/util/cleanup.cu b/src/flamegpu/util/cleanup.cu
index 8327d0336..8adc85105 100644
--- a/src/flamegpu/util/cleanup.cu
+++ b/src/flamegpu/util/cleanup.cu
@@ -1,8 +1,8 @@
 #include "flamegpu/util/cleanup.h"
 
 #include <cuda_runtime.h>
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
-#include "flamegpu/util/detail/JitifyCache.h"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/JitifyCache.h"
 
 namespace flamegpu {
 namespace util {
diff --git a/src/flamegpu/visualiser/AgentVis.cpp b/src/flamegpu/visualiser/AgentVis.cpp
index 1facf9bdd..1d455a30b 100644
--- a/src/flamegpu/visualiser/AgentVis.cpp
+++ b/src/flamegpu/visualiser/AgentVis.cpp
@@ -2,7 +2,7 @@
 
 #include "flamegpu/visualiser/AgentVis.h"
 
-#include "flamegpu/gpu/CUDAAgent.h"
+#include "flamegpu/simulation/detail/CUDAAgent.h"
 #include "flamegpu/model/AgentData.h"
 #include "flamegpu/model/AgentDescription.h"
 #include "flamegpu/visualiser/color/ColorFunction.h"
@@ -13,7 +13,7 @@
 namespace flamegpu {
 namespace visualiser {
 
-AgentVisData::AgentVisData(CUDAAgent &_agent, const std::shared_ptr<AutoPalette>& autopalette)
+AgentVisData::AgentVisData(detail::CUDAAgent &_agent, const std::shared_ptr<AutoPalette>& autopalette)
     : owned_auto_palette(nullptr)
     , agent(_agent)
     , agentData(std::const_pointer_cast<const AgentData>(_agent.getAgentDescription().agent)) {
diff --git a/src/flamegpu/visualiser/ModelVis.cpp b/src/flamegpu/visualiser/ModelVis.cpp
index d7ccd5c50..f3faaff99 100644
--- a/src/flamegpu/visualiser/ModelVis.cpp
+++ b/src/flamegpu/visualiser/ModelVis.cpp
@@ -5,7 +5,7 @@
 #include <thread>
 #include <utility>
 
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "flamegpu/model/AgentData.h"
 #include "flamegpu/visualiser/FLAMEGPU_Visualisation.h"
 
diff --git a/swig/python/flamegpu.i b/swig/python/flamegpu.i
index 068af868a..0a0b6fc22 100644
--- a/swig/python/flamegpu.i
+++ b/swig/python/flamegpu.i
@@ -570,7 +570,7 @@ class ModelVis;
 // Include Telemetry functions
 %include "flamegpu/io/Telemetry.h"
 
-%include "flamegpu/sim/AgentInterface.h"
+%include "flamegpu/simulation/detail/AgentInterface.h"
 
 %include "flamegpu/runtime/HostFunctionCallback.h"
 
@@ -606,48 +606,48 @@ class ModelVis;
 %include "flamegpu/model/SubAgentDescription.h"
 %include "flamegpu/model/SubEnvironmentDescription.h"
 
-%include "flamegpu/runtime/utility/RandomManager.cuh"
+%include "flamegpu/simulation/detail/RandomManager.cuh"
 
 // Include Simulation and CUDASimulation
 %feature("flatnested");     // flat nested on to ensure Config is included
-%include "flamegpu/sim/Simulation.h"
-%include "flamegpu/gpu/CUDASimulation.h"
+%include "flamegpu/simulation/Simulation.h"
+%include "flamegpu/simulation/CUDASimulation.h"
 %feature("flatnested", ""); // flat nested off
 
 %feature("flatnested");     // flat nested on to ensure Config is included
-%include "flamegpu/gpu/CUDAEnsemble.h"
+%include "flamegpu/simulation/CUDAEnsemble.h"
 %feature("flatnested", ""); // flat nested off
 
 %include "flamegpu/runtime/AgentFunction_shim.cuh"
 %include "flamegpu/runtime/AgentFunctionCondition_shim.cuh"
 
 // These are essentially nested classes that have been split out. 
-%include "flamegpu/pop/AgentVector_Agent.h"
-%include "flamegpu/pop/AgentVector.h"
-%include "flamegpu/pop/AgentInstance.h"
-%include "flamegpu/pop/DeviceAgentVector_impl.h"
-%include "flamegpu/pop/DeviceAgentVector.h"
+%include "flamegpu/simulation/AgentVector_Agent.h"
+%include "flamegpu/simulation/AgentVector.h"
+%include "flamegpu/runtime/agent/AgentInstance.h"
+%include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
+%include "flamegpu/runtime/agent/DeviceAgentVector.h"
 
 // Must wrap these prior to HostAPI where they are used to avoid issues with no default constructors etc.
-%include "flamegpu/runtime/utility/HostRandom.cuh"
+%include "flamegpu/runtime/random/HostRandom.cuh"
 
 %nodefaultctor flamegpu::HostMacroProperty_swig;
-%include "flamegpu/runtime/utility/HostMacroProperty.cuh"
-%include "flamegpu/runtime/utility/HostEnvironment.cuh"
+%include "flamegpu/runtime/environment/HostMacroProperty.cuh"
+%include "flamegpu/runtime/environment/HostEnvironment.cuh"
 
-%include "flamegpu/runtime/HostNewAgentAPI.h"
-%include "flamegpu/runtime/HostAgentAPI.cuh"
+%include "flamegpu/runtime/agent/HostNewAgentAPI.h"
+%include "flamegpu/runtime/agent/HostAgentAPI.cuh"
 %include "flamegpu/runtime/HostAPI.h" 
 
 // Include logging implementations
-%include "flamegpu/sim/LoggingConfig.h"
-%include "flamegpu/sim/AgentLoggingConfig.h"
-%include "flamegpu/sim/AgentLoggingConfig_SumReturn.h"
-%include "flamegpu/sim/LogFrame.h"  // Includes RunLog. 
+%include "flamegpu/simulation/LoggingConfig.h"
+%include "flamegpu/simulation/AgentLoggingConfig.h"
+%include "flamegpu/simulation/AgentLoggingConfig_SumReturn.h"
+%include "flamegpu/simulation/LogFrame.h"  // Includes RunLog. 
 
 // Include ensemble implementations
-%include "flamegpu/sim/RunPlan.h"
-%include "flamegpu/sim/RunPlanVector.h"
+%include "flamegpu/simulation/RunPlan.h"
+%include "flamegpu/simulation/RunPlanVector.h"
 
 // Include public utility headers
 %include "flamegpu/util/cleanup.h"
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2453d46cf..63d721bfb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,14 +12,17 @@ endif()
 # Define the source files early, prior to projects.
 # Prepare source files for the tests target
 SET(TESTS_SRC
-    # ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_func_pointer.cu # Does not currently build
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_compute_capability.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_cuda.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_wddm.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_dependency_versions.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_multi_thread_device.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_CUDAEventTimer.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_SteadyClockTimer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_cxxname.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/detail/test_rtc_multi_thread_device.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/exception/test_device_exception.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_cuda_simulation.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_cuda_simulation_concurrency.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_cuda_ensemble.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_gpu_validation.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_cuda_subagent.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/gpu/test_cuda_submacroenvironment.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/exception/test_rtc_device_exception.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/io/test_io.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/io/test_logging.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/io/test_logging_exceptions.cu
@@ -33,38 +36,44 @@ SET(TESTS_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/model/test_layer.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/model/test_subagent.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/model/test_subenvironment.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/pop/test_agent_vector.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/pop/test_agent_instance.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/pop/test_device_agent_vector.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/sim/test_host_functions.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/sim/test_RunPlan.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/sim/test_RunPlanVector.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_device_environment.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_cuda_simulation.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_cuda_simulation_concurrency.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_cuda_ensemble.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_gpu_validation.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/detail/test_cuda_subagent.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/detail/test_cuda_submacroenvironment.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_agent_vector.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_agent_instance.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_host_functions.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_RunPlan.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/simulation/test_RunPlanVector.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_agent_function_conditions.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_agent_random.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_agent_state_transition.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_device_agent_creation.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_device_api.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_device_environment.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_device_macro_property.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_environment_manager.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_api.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_agent_sort.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_agent_creation.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_environment.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_macro_property.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_host_random.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_spatial_agent_sort.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_min.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_max.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_sum.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_reduce.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_count.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_transform_reduce.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_histogram_even.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_mean_standarddeviation.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/host_reduction/test_misc.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_subenvironment_manager.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_rtc_device_api.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_host_environment.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_host_macro_property.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_subenvironment_manager.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_device_environment.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_device_macro_property.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/environment/test_environment_manager.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/random/test_agent_random.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/random/test_host_random.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/test_device_agent_vector.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/test_host_agent_sort.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/test_host_agent_creation.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/test_device_agent_creation.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/detail/test_spatial_agent_sort.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/detail/test_agent_state_transition.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_min.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_max.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_sum.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_reduce.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_count.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_transform_reduce.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_histogram_even.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_mean_standarddeviation.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/agent/host_reduction/test_misc.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/messaging/test_messaging.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/messaging/test_spatial_2d.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/messaging/test_spatial_3d.cu
@@ -75,18 +84,7 @@ SET(TESTS_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/messaging/test_bucket.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/messaging/test_append_truncate.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_cleanup.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_compute_capability.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_cuda.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_wddm.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_nvtx.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_dependency_versions.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_multi_thread_device.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_CUDAEventTimer.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_SteadyClockTimer.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_cxxname.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/runtime/test_rtc_device_api.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/util/test_rtc_multi_thread_device.cu
-    ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/exception/test_rtc_device_exception.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/test_namespaces/test_namespaces.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/test_namespaces/test_rtc_namespaces.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/test_cases/test_version.cpp
diff --git a/tests/helpers/main.cu b/tests/helpers/main.cu
index 65440633f..5d648fe9e 100644
--- a/tests/helpers/main.cu
+++ b/tests/helpers/main.cu
@@ -2,7 +2,7 @@
 #include <cstdio>
 #include <map>
 
-#include "flamegpu/gpu/CUDASimulation.h"
+#include "flamegpu/simulation/CUDASimulation.h"
 #include "gtest/gtest.h"
 #include "helpers/device_initialisation.h"
 #include "flamegpu/io/Telemetry.h"
diff --git a/tests/swig/python/runtime/test_agent_state_transition.py b/tests/swig/python/runtime/agent/detail/test_agent_state_transition.py
similarity index 100%
rename from tests/swig/python/runtime/test_agent_state_transition.py
rename to tests/swig/python/runtime/agent/detail/test_agent_state_transition.py
diff --git a/tests/swig/python/runtime/host_reduction/test_count.py b/tests/swig/python/runtime/agent/host_reduction/test_count.py
similarity index 100%
rename from tests/swig/python/runtime/host_reduction/test_count.py
rename to tests/swig/python/runtime/agent/host_reduction/test_count.py
diff --git a/tests/swig/python/runtime/host_reduction/test_max.py b/tests/swig/python/runtime/agent/host_reduction/test_max.py
similarity index 100%
rename from tests/swig/python/runtime/host_reduction/test_max.py
rename to tests/swig/python/runtime/agent/host_reduction/test_max.py
diff --git a/tests/swig/python/runtime/host_reduction/test_mean_standarddev.py b/tests/swig/python/runtime/agent/host_reduction/test_mean_standarddev.py
similarity index 100%
rename from tests/swig/python/runtime/host_reduction/test_mean_standarddev.py
rename to tests/swig/python/runtime/agent/host_reduction/test_mean_standarddev.py
diff --git a/tests/swig/python/runtime/host_reduction/test_min.py b/tests/swig/python/runtime/agent/host_reduction/test_min.py
similarity index 100%
rename from tests/swig/python/runtime/host_reduction/test_min.py
rename to tests/swig/python/runtime/agent/host_reduction/test_min.py
diff --git a/tests/swig/python/runtime/host_reduction/test_sum.py b/tests/swig/python/runtime/agent/host_reduction/test_sum.py
similarity index 100%
rename from tests/swig/python/runtime/host_reduction/test_sum.py
rename to tests/swig/python/runtime/agent/host_reduction/test_sum.py
diff --git a/tests/swig/python/runtime/test_device_agent_creation.py b/tests/swig/python/runtime/agent/test_device_agent_creation.py
similarity index 100%
rename from tests/swig/python/runtime/test_device_agent_creation.py
rename to tests/swig/python/runtime/agent/test_device_agent_creation.py
diff --git a/tests/swig/python/pop/test_device_agent_vector.py b/tests/swig/python/runtime/agent/test_device_agent_vector.py
similarity index 100%
rename from tests/swig/python/pop/test_device_agent_vector.py
rename to tests/swig/python/runtime/agent/test_device_agent_vector.py
diff --git a/tests/swig/python/runtime/test_host_agent_creation.py b/tests/swig/python/runtime/agent/test_host_agent_creation.py
similarity index 100%
rename from tests/swig/python/runtime/test_host_agent_creation.py
rename to tests/swig/python/runtime/agent/test_host_agent_creation.py
diff --git a/tests/swig/python/runtime/test_host_agent_sort.py b/tests/swig/python/runtime/agent/test_host_agent_sort.py
similarity index 100%
rename from tests/swig/python/runtime/test_host_agent_sort.py
rename to tests/swig/python/runtime/agent/test_host_agent_sort.py
diff --git a/tests/swig/python/runtime/test_agent_environment.py b/tests/swig/python/runtime/environment/test_agent_environment.py
similarity index 100%
rename from tests/swig/python/runtime/test_agent_environment.py
rename to tests/swig/python/runtime/environment/test_agent_environment.py
diff --git a/tests/swig/python/runtime/test_device_macro_property.py b/tests/swig/python/runtime/environment/test_device_macro_property.py
similarity index 100%
rename from tests/swig/python/runtime/test_device_macro_property.py
rename to tests/swig/python/runtime/environment/test_device_macro_property.py
diff --git a/tests/swig/python/runtime/test_environment_manager.py b/tests/swig/python/runtime/environment/test_environment_manager.py
similarity index 100%
rename from tests/swig/python/runtime/test_environment_manager.py
rename to tests/swig/python/runtime/environment/test_environment_manager.py
diff --git a/tests/swig/python/runtime/test_host_environment.py b/tests/swig/python/runtime/environment/test_host_environment.py
similarity index 100%
rename from tests/swig/python/runtime/test_host_environment.py
rename to tests/swig/python/runtime/environment/test_host_environment.py
diff --git a/tests/swig/python/runtime/test_host_macro_property.py b/tests/swig/python/runtime/environment/test_host_macro_property.py
similarity index 100%
rename from tests/swig/python/runtime/test_host_macro_property.py
rename to tests/swig/python/runtime/environment/test_host_macro_property.py
diff --git a/tests/swig/python/runtime/test_agent_random.py b/tests/swig/python/runtime/random/test_agent_random.py
similarity index 100%
rename from tests/swig/python/runtime/test_agent_random.py
rename to tests/swig/python/runtime/random/test_agent_random.py
diff --git a/tests/swig/python/runtime/test_host_random.py b/tests/swig/python/runtime/random/test_host_random.py
similarity index 100%
rename from tests/swig/python/runtime/test_host_random.py
rename to tests/swig/python/runtime/random/test_host_random.py
diff --git a/tests/swig/python/gpu/test_cuda_subagent.py b/tests/swig/python/simulation/detail/test_cuda_subagent.py
similarity index 100%
rename from tests/swig/python/gpu/test_cuda_subagent.py
rename to tests/swig/python/simulation/detail/test_cuda_subagent.py
diff --git a/tests/swig/python/gpu/test_cuda_submacroenvironment.py b/tests/swig/python/simulation/detail/test_cuda_submacroenvironment.py
similarity index 100%
rename from tests/swig/python/gpu/test_cuda_submacroenvironment.py
rename to tests/swig/python/simulation/detail/test_cuda_submacroenvironment.py
diff --git a/tests/swig/python/sim/test_RunPlan.py b/tests/swig/python/simulation/test_RunPlan.py
similarity index 100%
rename from tests/swig/python/sim/test_RunPlan.py
rename to tests/swig/python/simulation/test_RunPlan.py
diff --git a/tests/swig/python/sim/test_RunPlanVector.py b/tests/swig/python/simulation/test_RunPlanVector.py
similarity index 100%
rename from tests/swig/python/sim/test_RunPlanVector.py
rename to tests/swig/python/simulation/test_RunPlanVector.py
diff --git a/tests/swig/python/pop/test_agent_instance.py b/tests/swig/python/simulation/test_agent_instance.py
similarity index 100%
rename from tests/swig/python/pop/test_agent_instance.py
rename to tests/swig/python/simulation/test_agent_instance.py
diff --git a/tests/swig/python/pop/test_agent_vector.py b/tests/swig/python/simulation/test_agent_vector.py
similarity index 100%
rename from tests/swig/python/pop/test_agent_vector.py
rename to tests/swig/python/simulation/test_agent_vector.py
diff --git a/tests/swig/python/gpu/test_cuda_ensemble.py b/tests/swig/python/simulation/test_cuda_ensemble.py
similarity index 100%
rename from tests/swig/python/gpu/test_cuda_ensemble.py
rename to tests/swig/python/simulation/test_cuda_ensemble.py
diff --git a/tests/swig/python/gpu/test_cuda_simulation.py b/tests/swig/python/simulation/test_cuda_simulation.py
similarity index 100%
rename from tests/swig/python/gpu/test_cuda_simulation.py
rename to tests/swig/python/simulation/test_cuda_simulation.py
diff --git a/tests/swig/python/gpu/test_gpu_validation.py b/tests/swig/python/simulation/test_gpu_validation.py
similarity index 100%
rename from tests/swig/python/gpu/test_gpu_validation.py
rename to tests/swig/python/simulation/test_gpu_validation.py
diff --git a/tests/test_cases/util/test_CUDAEventTimer.cu b/tests/test_cases/detail/test_CUDAEventTimer.cu
similarity index 88%
rename from tests/test_cases/util/test_CUDAEventTimer.cu
rename to tests/test_cases/detail/test_CUDAEventTimer.cu
index 87c2c461d..a4e32f59a 100644
--- a/tests/test_cases/util/test_CUDAEventTimer.cu
+++ b/tests/test_cases/detail/test_CUDAEventTimer.cu
@@ -1,8 +1,8 @@
 #include <thread>
 #include <chrono>
-#include "flamegpu/util/detail/CUDAEventTimer.cuh"
-#include "flamegpu/util/detail/wddm.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/CUDAEventTimer.cuh"
+#include "flamegpu/detail/wddm.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "gtest/gtest.h"
 namespace flamegpu {
@@ -20,8 +20,8 @@ namespace test_CUDAEventTimer {
  */
 TEST(TestUtilCUDAEventTimer, CUDAEventTimer) {
     // Create an event timer, time should be 0 initially.
-    util::detail::Timer * timer = nullptr;
-    EXPECT_NO_THROW(timer = new util::detail::CUDAEventTimer());
+    detail::Timer * timer = nullptr;
+    EXPECT_NO_THROW(timer = new detail::CUDAEventTimer());
     // Expect an exception if sync is called via getElapsed* if start() has not yet been called.
     EXPECT_THROW(timer->getElapsedMilliseconds(), exception::TimerException);
     // Time an arbitrary event, and check the value is approximately correct.
@@ -33,7 +33,7 @@ TEST(TestUtilCUDAEventTimer, CUDAEventTimer) {
     const double min_expected_millis = min_expected_seconds * 1000.0;
     // If the WDDM driver is being used, this test is only accurate if the  start event is synchronised (pushed to the device) prior to the sleep.
     // Essentially, CUDAEventTimers should not be used to time host code, they are only accurate for  the device code which they wrap.
-    if (util::detail::wddm::deviceIsWDDM()) {
+    if (detail::wddm::deviceIsWDDM()) {
         gpuErrchk(cudaDeviceSynchronize());
     }
     // Sleep for some amount of time.
diff --git a/tests/test_cases/util/test_SteadyClockTimer.cpp b/tests/test_cases/detail/test_SteadyClockTimer.cpp
similarity index 85%
rename from tests/test_cases/util/test_SteadyClockTimer.cpp
rename to tests/test_cases/detail/test_SteadyClockTimer.cpp
index aa824b299..98010aa27 100644
--- a/tests/test_cases/util/test_SteadyClockTimer.cpp
+++ b/tests/test_cases/detail/test_SteadyClockTimer.cpp
@@ -1,8 +1,8 @@
 #include <thread>
 #include <chrono>
 #include <ratio>
-#include "flamegpu/util/detail/Timer.h"
-#include "flamegpu/util/detail/SteadyClockTimer.h"
+#include "flamegpu/detail/Timer.h"
+#include "flamegpu/detail/SteadyClockTimer.h"
 
 #include "gtest/gtest.h"
 namespace flamegpu {
@@ -10,8 +10,8 @@ namespace flamegpu {
 
 TEST(TestSteadyClockTimer, SteadyClockTimer) {
     // Create a steady clock timer via the base class
-    util::detail::Timer * timer = nullptr;
-    EXPECT_NO_THROW(timer = new util::detail::SteadyClockTimer());
+    detail::Timer * timer = nullptr;
+    EXPECT_NO_THROW(timer = new detail::SteadyClockTimer());
     // Expect an exception if sync is called via getElapsed* if start() has not yet been called.
     EXPECT_THROW(timer->getElapsedMilliseconds(), exception::TimerException);
     // Time an arbitrary event, and check the value is approximately correct.
diff --git a/tests/test_cases/util/test_compute_capability.cu b/tests/test_cases/detail/test_compute_capability.cu
similarity index 61%
rename from tests/test_cases/util/test_compute_capability.cu
rename to tests/test_cases/detail/test_compute_capability.cu
index c28db23ba..b84744721 100644
--- a/tests/test_cases/util/test_compute_capability.cu
+++ b/tests/test_cases/detail/test_compute_capability.cu
@@ -1,8 +1,8 @@
 #include <cuda_runtime.h>
 
 #include <vector>
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "gtest/gtest.h"
 namespace flamegpu {
@@ -24,21 +24,21 @@ TEST(TestUtilComputeCapability, getComputeCapability) {
         gpuErrchk(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, i));
         int reference = (10 * major) + minor;
         // The function should return the reference value.
-        EXPECT_EQ(util::detail::compute_capability::getComputeCapability(i), reference);
+        EXPECT_EQ(detail::compute_capability::getComputeCapability(i), reference);
     }
 
     // If the function is given a bad index, it should throw.
-    EXPECT_ANY_THROW(util::detail::compute_capability::getComputeCapability(-1));
-    EXPECT_ANY_THROW(util::detail::compute_capability::getComputeCapability(device_count));
+    EXPECT_ANY_THROW(detail::compute_capability::getComputeCapability(-1));
+    EXPECT_ANY_THROW(detail::compute_capability::getComputeCapability(device_count));
 }
 
 // Test getting the minimum compiled cuda capabillity.
 TEST(TestUtilComputeCapability, minimumCompiledComputeCapability) {
     // If the macro is defined, the returned value should match, otherwise it should be 0.
     #if defined(FLAMEGPU_TEST_MIN_CUDA_ARCH)
-        EXPECT_EQ(util::detail::compute_capability::minimumCompiledComputeCapability(), FLAMEGPU_TEST_MIN_CUDA_ARCH);
+        EXPECT_EQ(detail::compute_capability::minimumCompiledComputeCapability(), FLAMEGPU_TEST_MIN_CUDA_ARCH);
     #else
-        EXPECT_EQ(util::detail::compute_capability::minimumCompiledComputeCapability(), 0);
+        EXPECT_EQ(detail::compute_capability::minimumCompiledComputeCapability(), 0);
     #endif
 }
 
@@ -50,17 +50,17 @@ TEST(TestUtilComputeCapability, checkComputeCapability) {
         return;
     }
     // Get the minimum cc compiled for, previously tested.
-    int min_cc = util::detail::compute_capability::minimumCompiledComputeCapability();
+    int min_cc = detail::compute_capability::minimumCompiledComputeCapability();
     // For each CUDA device, get the compute capability and comapre it against
     for (int i = 0; i < device_count; i++) {
         // This function is tested elsewhere, so use it here.
-        int cc = util::detail::compute_capability::getComputeCapability(i);
-        EXPECT_EQ(util::detail::compute_capability::checkComputeCapability(i), cc >= min_cc);
+        int cc = detail::compute_capability::getComputeCapability(i);
+        EXPECT_EQ(detail::compute_capability::checkComputeCapability(i), cc >= min_cc);
     }
 
     // If the function is given a bad index, it should throw and the result is irrelevant.
-    EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(-1));
-    EXPECT_ANY_THROW(util::detail::compute_capability::checkComputeCapability(device_count));
+    EXPECT_ANY_THROW(detail::compute_capability::checkComputeCapability(-1));
+    EXPECT_ANY_THROW(detail::compute_capability::checkComputeCapability(device_count));
 }
 
 /**
@@ -68,7 +68,7 @@ TEST(TestUtilComputeCapability, checkComputeCapability) {
  * This depends on the CUDA version used, and the dynamically linked nvrtc (when CUDA >= 11.2) so this is not ideal to test. 
  */
 TEST(TestUtilComputeCapability, getNVRTCSupportedComputeCapabilties) {
-    std::vector<int> architectures = util::detail::compute_capability::getNVRTCSupportedComputeCapabilties();
+    std::vector<int> architectures = detail::compute_capability::getNVRTCSupportedComputeCapabilties();
 
     // CUDA 11.2+ we do not know what values or how many this should return, so just assume a non zero number will be returned (in case of future additions / removals)
     #if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && __CUDACC_VER_MINOR__ >= 2)
@@ -93,30 +93,30 @@ TEST(TestUtilComputeCapability, getNVRTCSupportedComputeCapabilties) {
  */
 TEST(TestUtilComputeCapability, selectAppropraiteComputeCapability) {
     // Check an exact match should be found
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {86}), 86);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, {86}), 86);
     // Check a miss but with a lower value returns the lower value
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {80}), 80);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, {80}), 80);
     // Check a miss without a valid value returns 0
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {90}), 0);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, {90}), 0);
     // Check a miss occurs when no values are present in the vector.
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, {}), 0);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, {}), 0);
 
     // CUDA 11.1-11.6, 35 to 86, 86 and 60 should be found, 30 should not.
     std::vector<int> CUDA_11_1_ARCHES = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86};
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_1_ARCHES), 86);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_1_ARCHES), 60);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_1_ARCHES), 0);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_1_ARCHES), 86);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_1_ARCHES), 60);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_1_ARCHES), 0);
 
     // CUDA 11.0, 86 should not be found, but 80 should be used instead. 60 should be found, 30 should not.
     std::vector<int> CUDA_11_0_ARCHES = {35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80};
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_0_ARCHES), 80);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_0_ARCHES), 60);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_0_ARCHES), 0);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_11_0_ARCHES), 80);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_11_0_ARCHES), 60);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_11_0_ARCHES), 0);
     // CUDA 10.0, 86 should not be found, 75 should be used. 60 should be found, 30 should eb found.
     std::vector<int> CUDA_10_0_ARCHES = {30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75};
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_10_0_ARCHES), 75);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_10_0_ARCHES), 60);
-    EXPECT_EQ(util::detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_10_0_ARCHES), 30);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(86, CUDA_10_0_ARCHES), 75);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(60, CUDA_10_0_ARCHES), 60);
+    EXPECT_EQ(detail::compute_capability::selectAppropraiteComputeCapability(30, CUDA_10_0_ARCHES), 30);
 }
 
 }  // namespace flamegpu
diff --git a/tests/test_cases/util/test_cuda.cu b/tests/test_cases/detail/test_cuda.cu
similarity index 86%
rename from tests/test_cases/util/test_cuda.cu
rename to tests/test_cases/detail/test_cuda.cu
index 65c10d343..aa70aff4d 100644
--- a/tests/test_cases/util/test_cuda.cu
+++ b/tests/test_cases/detail/test_cuda.cu
@@ -1,8 +1,8 @@
 #include <cuda_runtime.h>
 
 #include <vector>
-#include "flamegpu/util/detail/cuda.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/cuda.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "gtest/gtest.h"
 namespace flamegpu {
@@ -19,7 +19,7 @@ TEST(TestUtilDetailCuda, cudaFree) {
     gpuErrchk(cudaPointerGetAttributes(&attributes, d_ptr));
     EXPECT_EQ(attributes.type, cudaMemoryTypeDevice);
     // call the wrapped cuda free method
-    status = util::detail::cuda::cudaFree(d_ptr);
+    status = detail::cuda::cudaFree(d_ptr);
     // It should not have thrown any cuda errors in normal use.
     EXPECT_EQ(status, cudaSuccess);
     // The pointer will still have a non nullptr value, but it will no longer be a valid device ptr.
@@ -27,7 +27,7 @@ TEST(TestUtilDetailCuda, cudaFree) {
     gpuErrchk(cudaPointerGetAttributes(&attributes, d_ptr));
     EXPECT_EQ(attributes.type, cudaMemoryTypeUnregistered);
     // Try a double free.
-    status = util::detail::cuda::cudaFree(d_ptr);
+    status = detail::cuda::cudaFree(d_ptr);
     // This will appear to succeed (a double free is identical to a device reset then free according from cudaPointerGetAttributes' perspective), which is a difference from actual cudaFree which would return cudaErrorInvalidValue.
     EXPECT_EQ(status, cudaSuccess);
     // reset the ptr
@@ -41,7 +41,7 @@ TEST(TestUtilDetailCuda, cudaFree) {
     // Trigger a device reset
     cudaDeviceReset();
     // Attempt to free the ptr, this method should claim all things are fine (as the dev ptr has implicitly been freed)
-    status = util::detail::cuda::cudaFree(d_ptr);
+    status = detail::cuda::cudaFree(d_ptr);
     EXPECT_EQ(status, cudaSuccess);
 }
 
@@ -57,7 +57,7 @@ TEST(TestUtilDetailCuda, cudaFreeHost) {
     // this appears to return cudaMemoryTypeHost, even though it should return cudaMemoryTypeHost
     EXPECT_EQ(attributes.type, cudaMemoryTypeHost);
     // call the wrapped cuda free method
-    status = util::detail::cuda::cudaFreeHost(p_ptr);
+    status = detail::cuda::cudaFreeHost(p_ptr);
     // It should not have thrown any cuda errors in normal use.
     EXPECT_EQ(status, cudaSuccess);
     // The pointer will still have a non nullptr value, but it will no longer be a valid page-locked ptr.
@@ -66,7 +66,7 @@ TEST(TestUtilDetailCuda, cudaFreeHost) {
     EXPECT_EQ(attributes.type, cudaMemoryTypeUnregistered);
 
     // Try a double free.
-    status = util::detail::cuda::cudaFreeHost(p_ptr);
+    status = detail::cuda::cudaFreeHost(p_ptr);
     // This will appear to succeed (a double free is identical to a device reset then free according from cudaPointerGetAttributes' perspective), which is a difference from actual cudaFreeHost which would return cudaErrorInvalidValue.
     EXPECT_EQ(status, cudaSuccess);
     // reset the ptr
@@ -81,7 +81,7 @@ TEST(TestUtilDetailCuda, cudaFreeHost) {
     // Trigger a device reset
     cudaDeviceReset();
     // Attempt to free the ptr, this method should claim all things are fine (as the dev ptr has implicitly been freed)
-    status = util::detail::cuda::cudaFreeHost(p_ptr);
+    status = detail::cuda::cudaFreeHost(p_ptr);
     EXPECT_EQ(status, cudaSuccess);
 }
 
@@ -93,21 +93,21 @@ TEST(TestUtilDetailCuda, cuDevicePrimaryContextIsActive) {
     gpuErrchk(cudaFree(0));
     // check if the primary context is active or not for device 0, it shoudl be.
     bool isActive = false;
-    isActive = util::detail::cuda::cuDevicePrimaryContextIsActive(0);
+    isActive = detail::cuda::cuDevicePrimaryContextIsActive(0);
     EXPECT_EQ(isActive, true);
     // Call device reset and check again without establishing a new context, it should not be active.
     gpuErrchk(cudaDeviceReset());
-    isActive = util::detail::cuda::cuDevicePrimaryContextIsActive(0);
+    isActive = detail::cuda::cuDevicePrimaryContextIsActive(0);
     EXPECT_EQ(isActive, false);
     // Check that exceptions will be raised correctly when passing bad device ordinals.
     // Expect an exception if the ordinal is negative
-    EXPECT_THROW(util::detail::cuda::cuDevicePrimaryContextIsActive(-1), exception::InvalidCUDAdevice);
+    EXPECT_THROW(detail::cuda::cuDevicePrimaryContextIsActive(-1), exception::InvalidCUDAdevice);
     // First grab the device count, to check for exceptions when the device ordinal is too big.
     int deviceCount = 0;
     gpuErrchk(cudaGetDeviceCount(&deviceCount));
     if (deviceCount > 0) {
         // Expect an exception if the ordinal is too big.
-        EXPECT_THROW(util::detail::cuda::cuDevicePrimaryContextIsActive(deviceCount), exception::InvalidCUDAdevice);
+        EXPECT_THROW(detail::cuda::cuDevicePrimaryContextIsActive(deviceCount), exception::InvalidCUDAdevice);
     }
 }
 
diff --git a/tests/test_cases/detail/test_cxxname.cpp b/tests/test_cases/detail/test_cxxname.cpp
new file mode 100644
index 000000000..4c1a1241a
--- /dev/null
+++ b/tests/test_cases/detail/test_cxxname.cpp
@@ -0,0 +1,24 @@
+#include <string>
+#include "flamegpu/detail/cxxname.hpp"
+
+#include "gtest/gtest.h"
+
+namespace test_cxxname {
+
+/**
+ * Tests if cxxname::getUnqualifiedName behaves as intended.
+ */
+TEST(TestUtilCxxname, getUnqualifiedName) {
+    // Check with no qualification
+    EXPECT_EQ(flamegpu::detail::cxxname::getUnqualifiedName(std::string("ClassName")), std::string("ClassName"));
+    // Check with one qualifier
+    EXPECT_EQ(flamegpu::detail::cxxname::getUnqualifiedName(std::string("namespace::ClassName")), std::string("ClassName"));
+
+    // Check with two qualifiers
+    EXPECT_EQ(flamegpu::detail::cxxname::getUnqualifiedName(std::string("namespace::subnamespace::ClassName")), std::string("ClassName"));
+
+    // Check with const char * as an argument
+    EXPECT_EQ(flamegpu::detail::cxxname::getUnqualifiedName("namespace::ClassName"), "ClassName");
+}
+
+}  // namespace test_cxxname
diff --git a/tests/test_cases/util/test_dependency_versions.cu b/tests/test_cases/detail/test_dependency_versions.cu
similarity index 100%
rename from tests/test_cases/util/test_dependency_versions.cu
rename to tests/test_cases/detail/test_dependency_versions.cu
diff --git a/tests/test_cases/util/test_multi_thread_device.cu b/tests/test_cases/detail/test_multi_thread_device.cu
similarity index 98%
rename from tests/test_cases/util/test_multi_thread_device.cu
rename to tests/test_cases/detail/test_multi_thread_device.cu
index 6abcded56..baf341a14 100644
--- a/tests/test_cases/util/test_multi_thread_device.cu
+++ b/tests/test_cases/detail/test_multi_thread_device.cu
@@ -4,8 +4,8 @@
 
 #include "flamegpu/flamegpu.h"
 #include "gtest/gtest.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
@@ -518,7 +518,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_Agent) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
@@ -582,7 +582,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_Message) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
@@ -645,7 +645,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_Environment) {
     // BEGIN: Attempt to pre init contexts
     for (int device = 0; device < devices; ++device) {
         ASSERT_EQ(cudaSetDevice(device), cudaSuccess);
-        ASSERT_EQ(flamegpu::util::detail::cuda::cudaFree(nullptr), cudaSuccess);
+        ASSERT_EQ(flamegpu::detail::cuda::cudaFree(nullptr), cudaSuccess);
     }
     ASSERT_EQ(cudaSetDevice(0), cudaSuccess);
     // END: Attempt to pre init contexts
@@ -658,7 +658,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_Environment) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 m.Environment().setProperty<int>("one", 1 * (offset + 1));
@@ -743,7 +743,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_AgentOutput) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
@@ -818,7 +818,7 @@ TEST(MultiThreadDeviceTest, SameModelMultiDevice_AgentFunctionCondition) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
diff --git a/tests/test_cases/util/test_rtc_multi_thread_device.cu b/tests/test_cases/detail/test_rtc_multi_thread_device.cu
similarity index 97%
rename from tests/test_cases/util/test_rtc_multi_thread_device.cu
rename to tests/test_cases/detail/test_rtc_multi_thread_device.cu
index 4984e1f0b..04de77b52 100644
--- a/tests/test_cases/util/test_rtc_multi_thread_device.cu
+++ b/tests/test_cases/detail/test_rtc_multi_thread_device.cu
@@ -4,8 +4,8 @@
 
 #include "flamegpu/flamegpu.h"
 #include "gtest/gtest.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
-#include "flamegpu/util/detail/cuda.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
+#include "flamegpu/detail/cuda.cuh"
 
 namespace flamegpu {
 
@@ -138,7 +138,7 @@ TEST(RTCMultiThreadDeviceTest, SameModelMultiDevice_Message) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
@@ -201,7 +201,7 @@ TEST(RTCMultiThreadDeviceTest, SameModelMultiDevice_Environment) {
     // BEGIN: Attempt to pre init contexts
     for (int device = 0; device < devices; ++device) {
         ASSERT_EQ(cudaSetDevice(device), cudaSuccess);
-        ASSERT_EQ(flamegpu::util::detail::cuda::cudaFree(nullptr), cudaSuccess);
+        ASSERT_EQ(flamegpu::detail::cuda::cudaFree(nullptr), cudaSuccess);
     }
     ASSERT_EQ(cudaSetDevice(0), cudaSuccess);
     // END: Attempt to pre init contexts
@@ -214,7 +214,7 @@ TEST(RTCMultiThreadDeviceTest, SameModelMultiDevice_Environment) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 m.Environment().setProperty<int>("one", 1 * (offset + 1));
@@ -300,7 +300,7 @@ TEST(RTCMultiThreadDeviceTest, SameModelMultiDevice_AgentOutput) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
@@ -376,7 +376,7 @@ TEST(RTCMultiThreadDeviceTest, SameModelMultiDevice_AgentFunctionCondition) {
     // For each device
     for (int device = 0; device < devices; ++device) {
         // If built with a suitable compute capability
-        if (util::detail::compute_capability::checkComputeCapability(device)) {
+        if (detail::compute_capability::checkComputeCapability(device)) {
             for (int i = 0; i < SIMS_PER_DEVICE; ++i) {
                 // Set sim Running
                 sims.emplace(sims.end(), std::make_shared<CUDASimulation>(m));
diff --git a/tests/test_cases/util/test_wddm.cu b/tests/test_cases/detail/test_wddm.cu
similarity index 82%
rename from tests/test_cases/util/test_wddm.cu
rename to tests/test_cases/detail/test_wddm.cu
index c64fd7269..a24059592 100644
--- a/tests/test_cases/util/test_wddm.cu
+++ b/tests/test_cases/detail/test_wddm.cu
@@ -1,7 +1,7 @@
 #include <cuda_runtime.h>
 
-#include "flamegpu/util/detail/wddm.cuh"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/detail/wddm.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "gtest/gtest.h"
 namespace flamegpu {
@@ -29,12 +29,12 @@ TEST(TestUtilWDDM, deviceIsWDDM) {
             reference = !tccDriver;
         #endif
         // The function should return the reference value.
-        EXPECT_EQ(util::detail::wddm::deviceIsWDDM(i), reference);
+        EXPECT_EQ(detail::wddm::deviceIsWDDM(i), reference);
     }
 
     // If the function is given a bad index, it should throw.
-    EXPECT_ANY_THROW(util::detail::wddm::deviceIsWDDM(-1));
-    EXPECT_ANY_THROW(util::detail::wddm::deviceIsWDDM(device_count));
+    EXPECT_ANY_THROW(detail::wddm::deviceIsWDDM(-1));
+    EXPECT_ANY_THROW(detail::wddm::deviceIsWDDM(device_count));
 
     // Also check for the current device.
     int currentDeviceIndex = 0;
@@ -47,6 +47,6 @@ TEST(TestUtilWDDM, deviceIsWDDM) {
         // WDDM driver is if not the tcc driver, and on windows.
         reference = !tccDriver;
     #endif
-    EXPECT_EQ(util::detail::wddm::deviceIsWDDM(), reference);
+    EXPECT_EQ(detail::wddm::deviceIsWDDM(), reference);
 }
 }  // namespace flamegpu
diff --git a/tests/test_cases/model/test_environment_description.cu b/tests/test_cases/model/test_environment_description.cu
index 2c1886b86..7631d775a 100644
--- a/tests/test_cases/model/test_environment_description.cu
+++ b/tests/test_cases/model/test_environment_description.cu
@@ -88,8 +88,8 @@ void AddGet_SetGet_array_vec_test() {
     std::array<T, ARRAY_TEST_LEN> b;
     std::array<T, ARRAY_TEST_LEN> c;
     for (int i = 0; i < ARRAY_TEST_LEN; ++i) {
-        b[i] = T(static_cast<typename type_decode<T>::type_t>(i));
-        c[i] = T(static_cast<typename type_decode<T>::type_t>(ARRAY_TEST_LEN - i));
+        b[i] = T(static_cast<typename detail::type_decode<T>::type_t>(i));
+        c[i] = T(static_cast<typename detail::type_decode<T>::type_t>(ARRAY_TEST_LEN - i));
     }
     ed.newProperty<T, ARRAY_TEST_LEN>("a", b);
     std::array<T, ARRAY_TEST_LEN> a;
@@ -140,8 +140,8 @@ void AddGet_SetGet_array_element_vec_test() {
     std::array<T, ARRAY_TEST_LEN> b;
     std::array<T, ARRAY_TEST_LEN> c;
     for (int i = 0; i < ARRAY_TEST_LEN; ++i) {
-        b[i] = T(static_cast<typename type_decode<T>::type_t>(i));
-        c[i] = T(static_cast<typename type_decode<T>::type_t>(ARRAY_TEST_LEN - i));
+        b[i] = T(static_cast<typename detail::type_decode<T>::type_t>(i));
+        c[i] = T(static_cast<typename detail::type_decode<T>::type_t>(ARRAY_TEST_LEN - i));
     }
     ed.newProperty<T, ARRAY_TEST_LEN>("a", b);
     for (int i = 0; i < ARRAY_TEST_LEN; ++i) {
diff --git a/tests/test_cases/runtime/test_agent_state_transition.cu b/tests/test_cases/runtime/agent/detail/test_agent_state_transition.cu
similarity index 100%
rename from tests/test_cases/runtime/test_agent_state_transition.cu
rename to tests/test_cases/runtime/agent/detail/test_agent_state_transition.cu
diff --git a/tests/test_cases/runtime/test_spatial_agent_sort.cu b/tests/test_cases/runtime/agent/detail/test_spatial_agent_sort.cu
similarity index 100%
rename from tests/test_cases/runtime/test_spatial_agent_sort.cu
rename to tests/test_cases/runtime/agent/detail/test_spatial_agent_sort.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_count.cu b/tests/test_cases/runtime/agent/host_reduction/test_count.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_count.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_count.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_histogram_even.cu b/tests/test_cases/runtime/agent/host_reduction/test_histogram_even.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_histogram_even.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_histogram_even.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_max.cu b/tests/test_cases/runtime/agent/host_reduction/test_max.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_max.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_max.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_mean_standarddeviation.cu b/tests/test_cases/runtime/agent/host_reduction/test_mean_standarddeviation.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_mean_standarddeviation.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_mean_standarddeviation.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_min.cu b/tests/test_cases/runtime/agent/host_reduction/test_min.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_min.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_min.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_misc.cu b/tests/test_cases/runtime/agent/host_reduction/test_misc.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_misc.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_misc.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_reduce.cu b/tests/test_cases/runtime/agent/host_reduction/test_reduce.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_reduce.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_reduce.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_sum.cu b/tests/test_cases/runtime/agent/host_reduction/test_sum.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_sum.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_sum.cu
diff --git a/tests/test_cases/runtime/host_reduction/test_transform_reduce.cu b/tests/test_cases/runtime/agent/host_reduction/test_transform_reduce.cu
similarity index 100%
rename from tests/test_cases/runtime/host_reduction/test_transform_reduce.cu
rename to tests/test_cases/runtime/agent/host_reduction/test_transform_reduce.cu
diff --git a/tests/test_cases/runtime/test_device_agent_creation.cu b/tests/test_cases/runtime/agent/test_device_agent_creation.cu
similarity index 100%
rename from tests/test_cases/runtime/test_device_agent_creation.cu
rename to tests/test_cases/runtime/agent/test_device_agent_creation.cu
diff --git a/tests/test_cases/pop/test_device_agent_vector.cu b/tests/test_cases/runtime/agent/test_device_agent_vector.cu
similarity index 100%
rename from tests/test_cases/pop/test_device_agent_vector.cu
rename to tests/test_cases/runtime/agent/test_device_agent_vector.cu
diff --git a/tests/test_cases/runtime/test_host_agent_creation.cu b/tests/test_cases/runtime/agent/test_host_agent_creation.cu
similarity index 100%
rename from tests/test_cases/runtime/test_host_agent_creation.cu
rename to tests/test_cases/runtime/agent/test_host_agent_creation.cu
diff --git a/tests/test_cases/runtime/test_host_agent_sort.cu b/tests/test_cases/runtime/agent/test_host_agent_sort.cu
similarity index 100%
rename from tests/test_cases/runtime/test_host_agent_sort.cu
rename to tests/test_cases/runtime/agent/test_host_agent_sort.cu
diff --git a/tests/test_cases/runtime/test_device_environment.cu b/tests/test_cases/runtime/environment/test_device_environment.cu
similarity index 100%
rename from tests/test_cases/runtime/test_device_environment.cu
rename to tests/test_cases/runtime/environment/test_device_environment.cu
diff --git a/tests/test_cases/runtime/test_device_macro_property.cu b/tests/test_cases/runtime/environment/test_device_macro_property.cu
similarity index 100%
rename from tests/test_cases/runtime/test_device_macro_property.cu
rename to tests/test_cases/runtime/environment/test_device_macro_property.cu
diff --git a/tests/test_cases/runtime/test_environment_manager.cu b/tests/test_cases/runtime/environment/test_environment_manager.cu
similarity index 100%
rename from tests/test_cases/runtime/test_environment_manager.cu
rename to tests/test_cases/runtime/environment/test_environment_manager.cu
diff --git a/tests/test_cases/runtime/test_host_environment.cu b/tests/test_cases/runtime/environment/test_host_environment.cu
similarity index 99%
rename from tests/test_cases/runtime/test_host_environment.cu
rename to tests/test_cases/runtime/environment/test_host_environment.cu
index 0d31d95a8..054f7002f 100644
--- a/tests/test_cases/runtime/test_host_environment.cu
+++ b/tests/test_cases/runtime/environment/test_host_environment.cu
@@ -66,7 +66,7 @@ class MiniSim {
     static std::array<T, TEST_ARRAY_LEN> makeInit(int offset = 0) {
         std::array<T, TEST_ARRAY_LEN> init;
         for (int i = 0; i < TEST_ARRAY_LEN; ++i)
-            init[i] = T{static_cast<typename type_decode<T>::type_t>(i + 1 + offset)};
+            init[i] = T{static_cast<typename detail::type_decode<T>::type_t>(i + 1 + offset)};
         return init;
     }
     void run(int steps = 2) {
diff --git a/tests/test_cases/runtime/test_host_macro_property.cu b/tests/test_cases/runtime/environment/test_host_macro_property.cu
similarity index 100%
rename from tests/test_cases/runtime/test_host_macro_property.cu
rename to tests/test_cases/runtime/environment/test_host_macro_property.cu
diff --git a/tests/test_cases/runtime/test_subenvironment_manager.cu b/tests/test_cases/runtime/environment/test_subenvironment_manager.cu
similarity index 100%
rename from tests/test_cases/runtime/test_subenvironment_manager.cu
rename to tests/test_cases/runtime/environment/test_subenvironment_manager.cu
diff --git a/tests/test_cases/runtime/test_agent_random.cu b/tests/test_cases/runtime/random/test_agent_random.cu
similarity index 100%
rename from tests/test_cases/runtime/test_agent_random.cu
rename to tests/test_cases/runtime/random/test_agent_random.cu
diff --git a/tests/test_cases/runtime/test_host_random.cu b/tests/test_cases/runtime/random/test_host_random.cu
similarity index 100%
rename from tests/test_cases/runtime/test_host_random.cu
rename to tests/test_cases/runtime/random/test_host_random.cu
diff --git a/tests/test_cases/gpu/test_cuda_subagent.cu b/tests/test_cases/simulation/detail/test_cuda_subagent.cu
similarity index 100%
rename from tests/test_cases/gpu/test_cuda_subagent.cu
rename to tests/test_cases/simulation/detail/test_cuda_subagent.cu
diff --git a/tests/test_cases/gpu/test_cuda_submacroenvironment.cu b/tests/test_cases/simulation/detail/test_cuda_submacroenvironment.cu
similarity index 100%
rename from tests/test_cases/gpu/test_cuda_submacroenvironment.cu
rename to tests/test_cases/simulation/detail/test_cuda_submacroenvironment.cu
diff --git a/tests/test_cases/sim/test_RunPlan.cu b/tests/test_cases/simulation/test_RunPlan.cu
similarity index 100%
rename from tests/test_cases/sim/test_RunPlan.cu
rename to tests/test_cases/simulation/test_RunPlan.cu
diff --git a/tests/test_cases/sim/test_RunPlanVector.cu b/tests/test_cases/simulation/test_RunPlanVector.cu
similarity index 100%
rename from tests/test_cases/sim/test_RunPlanVector.cu
rename to tests/test_cases/simulation/test_RunPlanVector.cu
diff --git a/tests/test_cases/pop/test_agent_instance.cu b/tests/test_cases/simulation/test_agent_instance.cu
similarity index 100%
rename from tests/test_cases/pop/test_agent_instance.cu
rename to tests/test_cases/simulation/test_agent_instance.cu
diff --git a/tests/test_cases/pop/test_agent_vector.cu b/tests/test_cases/simulation/test_agent_vector.cu
similarity index 100%
rename from tests/test_cases/pop/test_agent_vector.cu
rename to tests/test_cases/simulation/test_agent_vector.cu
diff --git a/tests/test_cases/gpu/test_cuda_ensemble.cu b/tests/test_cases/simulation/test_cuda_ensemble.cu
similarity index 100%
rename from tests/test_cases/gpu/test_cuda_ensemble.cu
rename to tests/test_cases/simulation/test_cuda_ensemble.cu
diff --git a/tests/test_cases/gpu/test_cuda_simulation.cu b/tests/test_cases/simulation/test_cuda_simulation.cu
similarity index 99%
rename from tests/test_cases/gpu/test_cuda_simulation.cu
rename to tests/test_cases/simulation/test_cuda_simulation.cu
index 033c16ee7..2bf17e28e 100644
--- a/tests/test_cases/gpu/test_cuda_simulation.cu
+++ b/tests/test_cases/simulation/test_cuda_simulation.cu
@@ -4,7 +4,7 @@
 #include <set>
 
 #include "flamegpu/flamegpu.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
 #include "helpers/device_initialisation.h"
 #include "flamegpu/io/Telemetry.h"
 
@@ -63,7 +63,7 @@ TEST(TestCUDASimulation, AllDeviceIdValues) {
     }
     for (int i = 0; i < device_count; i++) {
         // Check if the specified device is allowed to run the tests to determine if the test should throw or not. This is system dependent so must be dynamic.
-        bool shouldThrowCCException = !flamegpu::util::detail::compute_capability::checkComputeCapability(i);
+        bool shouldThrowCCException = !flamegpu::detail::compute_capability::checkComputeCapability(i);
         // Initialise and run a simple model on each device in the system. This test is pointless on single GPU machines.
         ModelDescription m(MODEL_NAME);
         m.newAgent(AGENT_NAME);
@@ -305,7 +305,7 @@ TEST(TestCUDASimulation, GetAgent) {
     c.SimulationConfig().steps = 1;
     c.setPopulationData(pop);
     c.simulate();
-    AgentInterface &agent = c.getAgent(AGENT_NAME);
+    detail::AgentInterface &agent = c.getAgent(AGENT_NAME);
     for (int _i = 0; _i < AGENT_COUNT; ++_i) {
         int host = 0;
         cudaMemcpy(&host, reinterpret_cast<int*>(agent.getStateVariablePtr(ModelData::DEFAULT_STATE, VARIABLE_NAME)) + _i, sizeof(int), cudaMemcpyDeviceToHost);
diff --git a/tests/test_cases/gpu/test_cuda_simulation_concurrency.cu b/tests/test_cases/simulation/test_cuda_simulation_concurrency.cu
similarity index 99%
rename from tests/test_cases/gpu/test_cuda_simulation_concurrency.cu
rename to tests/test_cases/simulation/test_cuda_simulation_concurrency.cu
index d9a908285..7a25a4a54 100644
--- a/tests/test_cases/gpu/test_cuda_simulation_concurrency.cu
+++ b/tests/test_cases/simulation/test_cuda_simulation_concurrency.cu
@@ -1,5 +1,5 @@
 #include "flamegpu/flamegpu.h"
-#include "flamegpu/util/detail/compute_capability.cuh"
+#include "flamegpu/detail/compute_capability.cuh"
 #include "helpers/device_initialisation.h"
 
 #include "gtest/gtest.h"
diff --git a/tests/test_cases/gpu/test_gpu_validation.cu b/tests/test_cases/simulation/test_gpu_validation.cu
similarity index 100%
rename from tests/test_cases/gpu/test_gpu_validation.cu
rename to tests/test_cases/simulation/test_gpu_validation.cu
diff --git a/tests/test_cases/sim/test_host_functions.cu b/tests/test_cases/simulation/test_host_functions.cu
similarity index 100%
rename from tests/test_cases/sim/test_host_functions.cu
rename to tests/test_cases/simulation/test_host_functions.cu
diff --git a/tests/test_cases/util/test_cleanup.cu b/tests/test_cases/util/test_cleanup.cu
index c81e43a7f..0bbe42c8c 100644
--- a/tests/test_cases/util/test_cleanup.cu
+++ b/tests/test_cases/util/test_cleanup.cu
@@ -2,7 +2,7 @@
 
 #include <vector>
 #include "flamegpu/util/cleanup.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 #include "flamegpu/flamegpu.h"
 
 #include "gtest/gtest.h"
diff --git a/tests/test_cases/util/test_cxxname.cpp b/tests/test_cases/util/test_cxxname.cpp
deleted file mode 100644
index d2ff1b9d1..000000000
--- a/tests/test_cases/util/test_cxxname.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <string>
-#include "flamegpu/util/detail/cxxname.hpp"
-
-#include "gtest/gtest.h"
-
-namespace test_cxxname {
-
-/**
- * Tests if cxxname::getUnqualified name behaves as intended.
- */
-TEST(TestUtilCxxname, getUnqualifiedName) {
-    // Check with no qualification
-    EXPECT_EQ(flamegpu::util::detail::cxxname::getUnqualifiedName(std::string("ClassName")), std::string("ClassName"));
-    // Check with one qualifier
-    EXPECT_EQ(flamegpu::util::detail::cxxname::getUnqualifiedName(std::string("namespace::ClassName")), std::string("ClassName"));
-
-    // Check with two qualifiers
-    EXPECT_EQ(flamegpu::util::detail::cxxname::getUnqualifiedName(std::string("namespace::subnamespace::ClassName")), std::string("ClassName"));
-
-    // Check with const char * as an argument
-    EXPECT_EQ(flamegpu::util::detail::cxxname::getUnqualifiedName("namespace::ClassName"), "ClassName");
-}
-
-}  // namespace test_cxxname
diff --git a/tests/test_cases/util/test_nvtx.cu b/tests/test_cases/util/test_nvtx.cu
index e528f92d9..275bc1b5c 100644
--- a/tests/test_cases/util/test_nvtx.cu
+++ b/tests/test_cases/util/test_nvtx.cu
@@ -1,5 +1,5 @@
 #include "flamegpu/util/nvtx.h"
-#include "flamegpu/gpu/detail/CUDAErrorChecking.cuh"
+#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
 
 #include "gtest/gtest.h"
 

From 479afe45f88e4acb386de3e62f1f063ff1d173ee Mon Sep 17 00:00:00 2001
From: Robert Chisholm <robadob@robadob.org>
Date: Mon, 12 Dec 2022 08:59:03 +0000
Subject: [PATCH 2/2] Breaking Change: Removed
 CUDASimulation::getAgent()/getCUDAAgent()/getCUDAMessage()

These features were not intended for release; users should not have direct access to internal objects (the detail namespace). Roughly equivalent functionality is available via DeviceAgentVector within the HostAPI.

fixup
---
 include/flamegpu/simulation/CUDASimulation.h  | 27 ++++++++++-------
 include/flamegpu/simulation/Simulation.h      |  1 -
 src/flamegpu/runtime/HostAPI.cu               |  2 +-
 src/flamegpu/simulation/CUDASimulation.cu     | 16 ----------
 .../simulation/test_cuda_simulation.cu        | 30 -------------------
 5 files changed, 17 insertions(+), 59 deletions(-)

diff --git a/include/flamegpu/simulation/CUDASimulation.h b/include/flamegpu/simulation/CUDASimulation.h
index b7fdf8757..eca346b8e 100644
--- a/include/flamegpu/simulation/CUDASimulation.h
+++ b/include/flamegpu/simulation/CUDASimulation.h
@@ -53,9 +53,17 @@ class CUDASimulation : public Simulation {
      * Requires internal access to scan/scatter singletons
      */
     friend class HostAgentAPI;
+    friend class HostAPI;
+    /**
+     * Requires internal access to getCUDAAgent()
+     */
     friend class detail::SimRunner;
     friend class CUDAEnsemble;
 #ifdef FLAMEGPU_VISUALISATION
+    /**
+     * Requires internal access to getCUDAAgent()
+     */
+    friend class visualiser::ModelVis;
     friend struct visualiser::ModelVisData;
 #endif
     /**
@@ -250,17 +258,6 @@ class CUDASimulation : public Simulation {
     template<typename T>
     std::vector<T> getEnvironmentPropertyArray(const std::string& property_name);
 #endif
-    /**
-     * Returns the manager for the specified agent
-     * @todo remove? this is mostly internal methods that modeller doesn't need access to
-     */
-    detail::CUDAAgent& getCUDAAgent(const std::string &agent_name) const;
-    detail::AgentInterface &getAgent(const std::string &name) override;
-    /**
-     * Returns the manager for the specified agent
-     * @todo remove? this is mostly internal methods that modeller doesn't need access to
-     */
-    detail::CUDAMessage& getCUDAMessage(const std::string &message_name) const;
     /**
      * @return A mutable reference to the cuda model specific configuration struct
      * @see Simulation::applyConfig() Should be called afterwards to apply changes
@@ -372,6 +369,14 @@ class CUDASimulation : public Simulation {
     void printHelp_derived() override;
 
  private:
+    /**
+     * Returns the manager for the specified agent
+     */
+    detail::CUDAAgent& getCUDAAgent(const std::string& agent_name) const;
+    /**
+     * Returns the manager for the specified message
+     */
+    detail::CUDAMessage& getCUDAMessage(const std::string& message_name) const;
     /**
      * Reinitalises random generation for this model and all submodels
      * @param seed New random seed (this updates stored seed in config)
diff --git a/include/flamegpu/simulation/Simulation.h b/include/flamegpu/simulation/Simulation.h
index cbdc069c2..74a279c33 100644
--- a/include/flamegpu/simulation/Simulation.h
+++ b/include/flamegpu/simulation/Simulation.h
@@ -123,7 +123,6 @@ class Simulation {
     virtual void getPopulationData(AgentVector& population, const std::string& state_name = ModelData::DEFAULT_STATE) = 0;
 
     virtual const RunLog &getRunLog() const = 0;
-    virtual detail::AgentInterface &getAgent(const std::string &name) = 0;
 
     Config &SimulationConfig();
     const Config &getSimulationConfig() const;
diff --git a/src/flamegpu/runtime/HostAPI.cu b/src/flamegpu/runtime/HostAPI.cu
index 00f75c115..3534d81b3 100644
--- a/src/flamegpu/runtime/HostAPI.cu
+++ b/src/flamegpu/runtime/HostAPI.cu
@@ -45,7 +45,7 @@ HostAgentAPI HostAPI::agent(const std::string &agent_name, const std::string &st
     if (state == agt->second.end()) {
         THROW exception::InvalidAgentState("Agent '%s' in model description hierarchy does not contain state '%s'.\n", agent_name.c_str(), state_name.c_str());
     }
-    return HostAgentAPI(*this, agentModel.getAgent(agent_name), state_name, agentOffsets.at(agent_name), state->second);
+    return HostAgentAPI(*this, agentModel.getCUDAAgent(agent_name), state_name, agentOffsets.at(agent_name), state->second);
 }
 
 /**
diff --git a/src/flamegpu/simulation/CUDASimulation.cu b/src/flamegpu/simulation/CUDASimulation.cu
index 98a00c807..d48bfcc61 100644
--- a/src/flamegpu/simulation/CUDASimulation.cu
+++ b/src/flamegpu/simulation/CUDASimulation.cu
@@ -1379,7 +1379,6 @@ void CUDASimulation::getPopulationData(AgentVector& population, const std::strin
     it->second->getPopulationData(population, state_name);
     gpuErrchk(cudaDeviceSynchronize());
 }
-
 detail::CUDAAgent& CUDASimulation::getCUDAAgent(const std::string& agent_name) const {
     CUDAAgentMap::const_iterator it;
     it = agent_map.find(agent_name);
@@ -1391,21 +1390,6 @@ detail::CUDAAgent& CUDASimulation::getCUDAAgent(const std::string& agent_name) c
 
     return *(it->second);
 }
-
-detail::AgentInterface& CUDASimulation::getAgent(const std::string& agent_name) {
-    // Ensure singletons have been initialised
-    initialiseSingletons();
-
-    auto it = agent_map.find(agent_name);
-
-    if (it == agent_map.end()) {
-        THROW exception::InvalidCudaAgent("CUDA agent ('%s') not found, in CUDASimulation::getAgent().",
-            agent_name.c_str());
-    }
-
-    return *(it->second);
-}
-
 detail::CUDAMessage& CUDASimulation::getCUDAMessage(const std::string& message_name) const {
     CUDAMessageMap::const_iterator it;
     it = message_map.find(message_name);
diff --git a/tests/test_cases/simulation/test_cuda_simulation.cu b/tests/test_cases/simulation/test_cuda_simulation.cu
index 2bf17e28e..137b39eaf 100644
--- a/tests/test_cases/simulation/test_cuda_simulation.cu
+++ b/tests/test_cases/simulation/test_cuda_simulation.cu
@@ -291,36 +291,6 @@ TEST(TestCUDASimulation, SetGetPopulationData_InvalidAgent) {
     EXPECT_THROW(c.setPopulationData(pop), exception::InvalidAgent);
     EXPECT_THROW(c.getPopulationData(pop), exception::InvalidAgent);
 }
-TEST(TestCUDASimulation, GetAgent) {
-    ModelDescription m(MODEL_NAME);
-    AgentDescription a = m.newAgent(AGENT_NAME);
-    m.newLayer(LAYER_NAME).addAgentFunction(a.newFunction(FUNCTION_NAME, SetGetFn));
-    a.newVariable<int>(VARIABLE_NAME);
-    AgentVector pop(a, static_cast<unsigned int>(AGENT_COUNT));
-    for (int _i = 0; _i < AGENT_COUNT; ++_i) {
-        AgentVector::Agent i = pop[_i];
-        i.setVariable<int>(VARIABLE_NAME, _i);
-    }
-    CUDASimulation c(m);
-    c.SimulationConfig().steps = 1;
-    c.setPopulationData(pop);
-    c.simulate();
-    detail::AgentInterface &agent = c.getAgent(AGENT_NAME);
-    for (int _i = 0; _i < AGENT_COUNT; ++_i) {
-        int host = 0;
-        cudaMemcpy(&host, reinterpret_cast<int*>(agent.getStateVariablePtr(ModelData::DEFAULT_STATE, VARIABLE_NAME)) + _i, sizeof(int), cudaMemcpyDeviceToHost);
-        EXPECT_EQ(host, _i * MULTIPLIER);
-        host = _i * 2;
-        cudaMemcpy(reinterpret_cast<int*>(agent.getStateVariablePtr(ModelData::DEFAULT_STATE, VARIABLE_NAME)) + _i, &host, sizeof(int), cudaMemcpyHostToDevice);
-    }
-    c.simulate();
-    agent = c.getAgent(AGENT_NAME);
-    for (int _i = 0; _i < AGENT_COUNT; ++_i) {
-        int host = 0;
-        cudaMemcpy(&host, reinterpret_cast<int*>(agent.getStateVariablePtr(ModelData::DEFAULT_STATE, VARIABLE_NAME)) + _i, sizeof(int), cudaMemcpyDeviceToHost);
-        EXPECT_EQ(host, _i * 2 * MULTIPLIER);
-    }
-}
 
 TEST(TestCUDASimulation, Step) {
     // Test that step does a single step