Improve CUDAEnsemble's error reporting. #839

Merged · 1 commit · May 18, 2022

Changes from all commits
4 changes: 4 additions & 0 deletions include/flamegpu/exception/FLAMEGPUException.h
@@ -411,6 +411,10 @@ DERIVED_FLAMEGPUException(InvalidDependencyGraph, "Agent function dependency gra
* This should not occur if the shared ID matches ID_NOT_SET
*/
DERIVED_FLAMEGPUException(AgentIDCollision, "Multiple agents of same type share an ID");
/**
* Defines an error when runs fail during an ensemble's execution
*/
DERIVED_FLAMEGPUException(EnsembleError, "One or more runs failed during the ensemble's execution");

} // namespace exception
} // namespace flamegpu
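As a hedged illustration of what the new exception type enables downstream: a minimal sketch of catching it around an ensemble run. The `runEnsemble` wrapper, the umbrella include, and the assumption that `model` and `plans` are built elsewhere are illustrative, not part of this PR.

```cpp
#include <cstdio>

#include "flamegpu/flamegpu.h"

// Hypothetical wrapper: `model` and `plans` are assumed to be configured elsewhere.
int runEnsemble(flamegpu::ModelDescription &model, const flamegpu::RunPlanVector &plans) {
    flamegpu::CUDAEnsemble ensemble(model);
    try {
        ensemble.simulate(plans);
    } catch (const flamegpu::exception::EnsembleError &e) {
        // Raised at the Slow/Fast error levels when one or more runs fail
        std::fprintf(stderr, "Ensemble failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```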
21 changes: 18 additions & 3 deletions include/flamegpu/gpu/CUDAEnsemble.h
@@ -25,34 +25,47 @@ class CUDAEnsemble {
* Execution config for running a CUDAEnsemble
*/
struct EnsembleConfig {
// std::string in = "";
/**
* Directory to store output data (primarily logs)
* Defaults to "" (the working directory, no subdirectory)
*/
std::string out_directory = "";
/**
* Output format
* This must be a supported format, e.g. "json" or "xml"
* Defaults to "json"
*/
std::string out_format = "json";
/**
* The maximum number of concurrent runs
* Defaults to 4
*/
unsigned int concurrent_runs = 4;
/**
* The CUDA device ids of devices to be used
* If this is left empty, all available devices will be used
* Defaults to empty set (all available devices)
*/
std::set<int> devices;
/**
* If true progress logging to stdout will be suppressed
* Defaults to false
*/
bool quiet = false;
/**
* If true, the total runtime for the ensemble will be printed to stdout at completion
* This is independent of EnsembleConfig::quiet
* Defaults to false
*/
bool timing = false;
enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };
/**
* Off: Runs which fail do not cause an exception to be raised. Failed runs must be probed manually by checking the return value of CUDAEnsemble::simulate()
* Slow: If any runs fail, an EnsembleError will be raised after all runs have been attempted, before CUDAEnsemble::simulate() returns.
* Fast: An EnsembleError will be raised as soon as a failed run is detected, cancelling the remaining runs.
* Defaults to Slow
*/
ErrorLevel error_level = Slow;
};
/**
* Initialise CUDA Ensemble
@@ -70,10 +83,12 @@ class CUDAEnsemble {

/**
* Execute the ensemble of simulations.
* This call will block until all simulations have completed or MAX_ERRORS simulations exit with an error
* This call will normally block until all simulations have completed; however, it may return early if a run fails while the error_level configuration is set to Fast
* @param plan The plan of individual runs to execute during the ensemble
* @return 0 on success, otherwise the number of runs which reported errors and failed
* @see CUDAEnsemble::EnsembleConfig::error_level
*/
void simulate(const RunPlanVector &plan);
unsigned int simulate(const RunPlanVector &plan);

/**
* @return A mutable reference to the ensemble configuration struct
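The return-type change pairs with the `Off` level: callers that opt out of exceptions poll the failure count instead. A minimal sketch, assuming a mutable config accessor named `Config()` (a hypothetical name here; the accessor's declaration is truncated in this view):

```cpp
#include <cstdio>

#include "flamegpu/flamegpu.h"

// `model` and `plans` are assumed to be set up elsewhere.
unsigned int runWithoutExceptions(flamegpu::ModelDescription &model, const flamegpu::RunPlanVector &plans) {
    flamegpu::CUDAEnsemble ensemble(model);
    // With Off, simulate() does not throw for failed runs; check the count instead
    ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Off;
    const unsigned int failed = ensemble.simulate(plans);  // No longer void: returns the failure count
    if (failed)
        std::fprintf(stderr, "%u runs reported errors\n", failed);
    return failed;
}
```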
2 changes: 1 addition & 1 deletion include/flamegpu/model/EnvironmentDescription.h
@@ -42,7 +42,7 @@ class EnvironmentDescription {
friend class CUDASimulation;

friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
2 changes: 1 addition & 1 deletion include/flamegpu/sim/LoggingConfig.h
@@ -38,7 +38,7 @@ class LoggingConfig {
/**
* Requires access to log_timing
*/
friend void CUDAEnsemble::simulate(const RunPlanVector& plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);

public:
/**
2 changes: 1 addition & 1 deletion include/flamegpu/sim/RunPlanVector.h
@@ -25,7 +25,7 @@ class EnvironmentDescription;
class RunPlanVector : private std::vector<RunPlan> {
friend class RunPlan;
friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
21 changes: 20 additions & 1 deletion include/flamegpu/sim/SimRunner.h
@@ -8,6 +8,7 @@
#include <condition_variable>
#include <thread>
#include <vector>
#include <string>
#include "flamegpu/sim/LogFrame.h"

namespace flamegpu {
@@ -25,6 +26,12 @@ class RunPlanVector;
*/
class SimRunner {
friend class CUDAEnsemble;
struct ErrorDetail {
unsigned int run_id;
unsigned int device_id;
unsigned int runner_id;
std::string exception_string;
};
/**
* Constructor, creates and initialise a new SimRunner
* @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances.
@@ -36,10 +43,12 @@
* @param _device_id The GPU that all runs should execute on
* @param _runner_id A unique index assigned to the runner
* @param _verbose If true more information will be written to stdout
* @param _fail_fast If true, a failed run will cancel the remaining runs, and its error will be rethrown from the main thread
* @param run_logs Reference to the vector used to store generated run logs
* @param log_export_queue The queue of logs to be exported to disk
* @param log_export_queue_mutex This mutex must be locked to access log_export_queue
* @param log_export_queue_cdn The condition is notified every time a log has been added to the queue
* @param fast_err_detail Structure to store error details on fast failure, for rethrow from the main thread
*/
SimRunner(const std::shared_ptr<const ModelData> _model,
std::atomic<unsigned int> &_err_ct,
@@ -50,10 +59,12 @@
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &run_logs,
std::queue<unsigned int> &log_export_queue,
std::mutex &log_export_queue_mutex,
std::condition_variable &log_export_queue_cdn);
std::condition_variable &log_export_queue_cdn,
ErrorDetail &fast_err_detail);
/**
* Each sim runner takes its own clone of the model description hierarchy, so it can manipulate the environment without conflict
*/
@@ -74,6 +85,10 @@
* Flag for whether to print progress
*/
const bool verbose;
/**
* Flag for whether the runner should fail fast, cancelling the ensemble and raising an exception as soon as a run errors
*/
const bool fail_fast;
/**
* The thread which the SimRunner executes on
*/
@@ -119,6 +134,10 @@
* The condition is notified every time a log has been added to the queue
*/
std::condition_variable &log_export_queue_cdn;
/**
* If fail_fast is true, error details will be stored here on failure, so an exception can be thrown from the main thread
*/
ErrorDetail& fast_err_detail;
};

} // namespace flamegpu
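`ErrorDetail` exists so a worker thread can hand its failure back to the main thread for rethrow. A generic, standalone sketch of that pattern with a plain `std::thread` (toy code, not FLAMEGPU's `SimRunner`):

```cpp
#include <atomic>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>

// Failure details recorded by the worker, read by the main thread after join()
struct ErrorDetail {
    unsigned int run_id = 0;
    std::string exception_string;
};

int main() {
    std::atomic<unsigned int> err_ct{0};
    std::mutex detail_mutex;
    ErrorDetail detail;

    std::thread worker([&]() {
        try {
            throw std::runtime_error("simulated failure");  // stand-in for a failed run
        } catch (const std::exception &e) {
            ++err_ct;
            std::lock_guard<std::mutex> lck(detail_mutex);  // protect the shared detail
            detail.run_id = 7;
            detail.exception_string = e.what();
        }
    });
    worker.join();

    if (err_ct.load())  // rethrow from the main thread with the stored context
        throw std::runtime_error("Run " + std::to_string(detail.run_id) +
                                 " failed: " + detail.exception_string);
    return 0;
}
```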
41 changes: 39 additions & 2 deletions src/flamegpu/gpu/CUDAEnsemble.cu
@@ -35,7 +35,7 @@ CUDAEnsemble::~CUDAEnsemble() {



void CUDAEnsemble::simulate(const RunPlanVector &plans) {
unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
// Validate that RunPlan model matches CUDAEnsemble model
if (*plans.environment != this->model->environment->properties) {
THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()");
@@ -111,6 +111,7 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
std::queue<unsigned int> log_export_queue;
std::mutex log_export_queue_mutex;
std::condition_variable log_export_queue_cdn;
SimRunner::ErrorDetail fast_err_detail = {};

// Init with placement new
{
@@ -121,7 +122,11 @@
unsigned int i = 0;
for (auto &d : devices) {
for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans, step_log_config, exit_log_config, d, j, !config.quiet, run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn);
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
step_log_config, exit_log_config,
d, j,
!config.quiet, config.error_level == EnsembleConfig::Fast,
run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail);
}
}
}
@@ -174,6 +179,15 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {

// Free memory
free(runners);

if (config.error_level == EnsembleConfig::Fast && err_ct.load()) {
THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n",
fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str());
} else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) {
THROW exception::EnsembleError("%u/%u runs failed!\n.", err_ct.load(), static_cast<unsigned int>(plans.size()));
}

return err_ct.load();
}

void CUDAEnsemble::initialise(int argc, const char** argv) {
@@ -273,6 +287,27 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) {
config.timing = true;
continue;
}
// -e/--error, Specify the error level
if (arg.compare("--error") == 0 || arg.compare("-e") == 0) {
if (i + 1 >= argc) {
fprintf(stderr, "%s requires a trailing argument\n", arg.c_str());
return false;
}
std::string error_level_string = argv[++i];
// Convert the trailing arg to lower case
std::transform(error_level_string.begin(), error_level_string.end(), error_level_string.begin(), [](unsigned char c) { return std::use_facet< std::ctype<char>>(std::locale()).tolower(c); });
if (error_level_string.compare("off") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Off)) == 0) {
config.error_level = EnsembleConfig::Off;
} else if (error_level_string.compare("slow") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Slow)) == 0) {
config.error_level = EnsembleConfig::Slow;
} else if (error_level_string.compare("fast") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Fast)) == 0) {
config.error_level = EnsembleConfig::Fast;
} else {
fprintf(stderr, "%s is not an appropriate argument for %s\n", error_level_string.c_str(), arg.c_str());
return false;
}
continue;
}
fprintf(stderr, "Unexpected argument: %s\n", arg.c_str());
printHelp(argv[0]);
return false;
@@ -292,6 +327,8 @@ void CUDAEnsemble::printHelp(const char *executable) {
printf(line_fmt, "-o, --out <directory> <filetype>", "Directory and filetype for ensemble outputs");
printf(line_fmt, "-q, --quiet", "Don't print progress information to console");
printf(line_fmt, "-t, --timing", "Output timing information to stdout");
printf(line_fmt, "-e, --error <error level>", "The error level 0, 1, 2, off, slow or fast");
printf(line_fmt, "", "By default, \"slow\" will be used.");
}
void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) {
// Validate ModelDescription matches
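With this change an ensemble binary accepts e.g. `--error fast` or `-e 2`. A standalone sketch of the same case-insensitive mapping, using `std::tolower` from `<cctype>` rather than the locale facet above (names are illustrative):

```cpp
#include <algorithm>
#include <cctype>
#include <optional>
#include <string>

enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };

// Maps "off"/"slow"/"fast" (or "0"/"1"/"2") onto the enum, case-insensitively
std::optional<ErrorLevel> parseErrorLevel(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (s == "off"  || s == "0") return Off;
    if (s == "slow" || s == "1") return Slow;
    if (s == "fast" || s == "2") return Fast;
    return std::nullopt;  // caller reports the unrecognised argument
}
```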
30 changes: 27 additions & 3 deletions src/flamegpu/sim/SimRunner.cu
@@ -23,15 +23,18 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &_run_logs,
std::queue<unsigned int> &_log_export_queue,
std::mutex &_log_export_queue_mutex,
std::condition_variable &_log_export_queue_cdn)
std::condition_variable &_log_export_queue_cdn,
ErrorDetail &_fast_err_detail)
: model(_model->clone())
, run_id(0)
, device_id(_device_id)
, runner_id(_runner_id)
, verbose(_verbose)
, fail_fast(_fail_fast)
, err_ct(_err_ct)
, next_run(_next_run)
, plans(_plans)
@@ -40,7 +43,8 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
, run_logs(_run_logs)
, log_export_queue(_log_export_queue)
, log_export_queue_mutex(_log_export_queue_mutex)
, log_export_queue_cdn(_log_export_queue_cdn) {
, log_export_queue_cdn(_log_export_queue_cdn)
, fast_err_detail(_fast_err_detail) {
this->thread = std::thread(&SimRunner::start, this);
// Attempt to name the thread
#ifdef _MSC_VER
@@ -103,7 +107,27 @@ void SimRunner::start() {
fflush(stdout);
}
} catch(std::exception &e) {
fprintf(stderr, "\nRun %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
++err_ct;
if (this->fail_fast) {
// Kill the other workers early
next_run += static_cast<unsigned int>(plans.size());
{
std::lock_guard<std::mutex> lck(log_export_queue_mutex);
log_export_queue.push(UINT_MAX);
// log_export_queue_mutex is treated as our protection against race conditions on fast_err_detail
fast_err_detail.run_id = run_id;
fast_err_detail.device_id = device_id;
fast_err_detail.runner_id = runner_id;
fast_err_detail.exception_string = e.what();
}
return;
} else {
if (verbose) {
fprintf(stdout, "\n");
fflush(stdout);
}
fprintf(stderr, "Run %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
}
}
}
}
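The fail-fast branch cancels sibling runners by jumping the shared `next_run` counter past `plans.size()`, and pushes `UINT_MAX` as a sentinel to wake the log-export thread. A minimal standalone illustration of the counter trick (toy code under those assumptions, not FLAMEGPU's scheduler):

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const unsigned int total = 100;           // stand-in for plans.size()
    std::atomic<unsigned int> next_run{0};    // shared job counter
    std::vector<std::thread> workers;

    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&]() {
            unsigned int i;
            while ((i = next_run++) < total) {  // claim the next job
                if (i == 13) {                  // stand-in for a failed run
                    next_run += total;          // cancel: siblings now fail the loop test
                    return;
                }
            }
        });
    }
    for (auto &w : workers) w.join();
    std::printf("counter stopped at %u\n", next_run.load());
    return 0;
}
```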