From bd7b2db0a947ee9451132eb7ab8134302a5be616 Mon Sep 17 00:00:00 2001
From: Robert Chisholm
Date: Wed, 27 Apr 2022 14:10:51 +0100
Subject: [PATCH] Improve CUDAEnsemble's error reporting.

With 3 additional tests verifying that the 3 levels of reporting all work.
---
 .../flamegpu/exception/FLAMEGPUException.h  |   4 +
 include/flamegpu/gpu/CUDAEnsemble.h         |  10 +-
 .../flamegpu/model/EnvironmentDescription.h |   2 +-
 include/flamegpu/sim/LoggingConfig.h        |   2 +-
 include/flamegpu/sim/RunPlanVector.h        |   2 +-
 include/flamegpu/sim/SimRunner.h            |  21 +++-
 src/flamegpu/gpu/CUDAEnsemble.cu            |  40 ++++++-
 src/flamegpu/sim/SimRunner.cu               |  30 ++++-
 tests/test_cases/gpu/test_cuda_ensemble.cu  | 106 ++++++++++++++++++
 9 files changed, 207 insertions(+), 10 deletions(-)

diff --git a/include/flamegpu/exception/FLAMEGPUException.h b/include/flamegpu/exception/FLAMEGPUException.h
index d9c371cc8..e31396278 100644
--- a/include/flamegpu/exception/FLAMEGPUException.h
+++ b/include/flamegpu/exception/FLAMEGPUException.h
@@ -411,6 +411,10 @@ DERIVED_FLAMEGPUException(InvalidDependencyGraph, "Agent function dependency gra
  * This should not occur if the shared ID matches ID_NOT_SET
  */
 DERIVED_FLAMEGPUException(AgentIDCollision, "Multiple agents of same type share an ID");
+/**
+ * Defines an error when runs fail during an ensemble's execution
+ */
+DERIVED_FLAMEGPUException(EnsembleError, "One or more runs failed during the ensemble's execution");

 }  // namespace exception
 }  // namespace flamegpu

diff --git a/include/flamegpu/gpu/CUDAEnsemble.h b/include/flamegpu/gpu/CUDAEnsemble.h
index faf103dec..c97994d5b 100644
--- a/include/flamegpu/gpu/CUDAEnsemble.h
+++ b/include/flamegpu/gpu/CUDAEnsemble.h
@@ -53,6 +53,13 @@ class CUDAEnsemble {
          * This is independent of the EnsembleConfig::quiet
          */
         bool timing = false;
+        enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };
+        /**
+         * Off: Runs which fail do not cause an exception to be raised. Failed runs must be probed manually, by checking the return value of CUDAEnsemble::simulate()
+         * Slow: If any runs fail, an exception::EnsembleError will be raised after all runs have been attempted, before CUDAEnsemble::simulate() returns.
+         * Fast: An exception::EnsembleError will be raised as soon as a failed run is detected, cancelling remaining runs.
+         */
+        ErrorLevel error_level = Slow;
     };
     /**
     * Initialise CUDA Ensemble
@@ -72,8 +79,9 @@ class CUDAEnsemble {
     * Execute the ensemble of simulations.
    * This call will block until all simulations have completed or MAX_ERRORS simulations exit with an error
    * @param plan The plan of individual runs to execute during the ensemble
+   * @return 0 on success, otherwise the number of runs which reported errors and failed
    */
-    void simulate(const RunPlanVector &plan);
+    unsigned int simulate(const RunPlanVector &plan);
    /**
    * @return A mutable reference to the ensemble configuration struct

diff --git a/include/flamegpu/model/EnvironmentDescription.h b/include/flamegpu/model/EnvironmentDescription.h
index 686c32ff8..b9fe93388 100644
--- a/include/flamegpu/model/EnvironmentDescription.h
+++ b/include/flamegpu/model/EnvironmentDescription.h
@@ -42,7 +42,7 @@ class EnvironmentDescription {
     friend class CUDASimulation;
     friend class SimRunner;
-    friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

  public:
     /**

diff --git a/include/flamegpu/sim/LoggingConfig.h b/include/flamegpu/sim/LoggingConfig.h
index 2bd254f7f..c19f652b8 100644
--- a/include/flamegpu/sim/LoggingConfig.h
+++ b/include/flamegpu/sim/LoggingConfig.h
@@ -38,7 +38,7 @@ class LoggingConfig {
     /**
      * Requires access to log_timing
      */
-    friend void CUDAEnsemble::simulate(const RunPlanVector& plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);

  public:
     /**

diff --git a/include/flamegpu/sim/RunPlanVector.h b/include/flamegpu/sim/RunPlanVector.h
index 3df0e95d6..b25cecbef 100644
--- a/include/flamegpu/sim/RunPlanVector.h
+++ b/include/flamegpu/sim/RunPlanVector.h
@@ -25,7 +25,7 @@ class EnvironmentDescription;
 class RunPlanVector : private std::vector<RunPlan> {
     friend class RunPlan;
     friend class SimRunner;
-    friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

  public:
     /**

diff --git a/include/flamegpu/sim/SimRunner.h b/include/flamegpu/sim/SimRunner.h
index 97496055f..19f47f08b 100644
--- a/include/flamegpu/sim/SimRunner.h
+++ b/include/flamegpu/sim/SimRunner.h
@@ -8,6 +8,7 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#include <string>

 #include "flamegpu/sim/LogFrame.h"

 namespace flamegpu {
@@ -25,6 +26,12 @@ class RunPlanVector;
  */
 class SimRunner {
    friend class CUDAEnsemble;
+    struct ErrorDetail {
+        unsigned int run_id;
+        unsigned int device_id;
+        unsigned int runner_id;
+        std::string exception_string;
+    };
    /**
     * Constructor, creates and initialises a new SimRunner
     * @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances.
@@ -36,10 +43,12 @@ class SimRunner {
     * @param _device_id The GPU that all runs should execute on
     * @param _runner_id A unique index assigned to the runner
     * @param _verbose If true more information will be written to stdout
+    * @param _fail_fast If true, the SimRunner will kill other runners and throw an exception on error
     * @param run_logs Reference to the vector to store generated run logs
     * @param log_export_queue The queue of logs to be exported to disk
     * @param log_export_queue_mutex This mutex must be locked to access log_export_queue
     * @param log_export_queue_cdn The condition is notified every time a log has been added to the queue
+    * @param fast_err_detail Structure to store error details on fast failure for main thread rethrow
     */
    SimRunner(const std::shared_ptr<const ModelData> _model,
        std::atomic<unsigned int> &_err_ct,
@@ -50,10 +59,12 @@ class SimRunner {
        int _device_id,
        unsigned int _runner_id,
        bool _verbose,
+        bool _fail_fast,
        std::vector<RunLog> &run_logs,
        std::queue<unsigned int> &log_export_queue,
        std::mutex &log_export_queue_mutex,
-        std::condition_variable &log_export_queue_cdn);
+        std::condition_variable &log_export_queue_cdn,
+        ErrorDetail &fast_err_detail);
    /**
     * Each sim runner takes its own clone of model description hierarchy, so it can manipulate environment without conflict
     */
@@ -74,6 +85,10 @@ class SimRunner {
     * Flag for whether to print progress
     */
    const bool verbose;
+    /**
+     * Flag for whether the ensemble should fail fast, throwing an exception as soon as a run fails
+     */
+    const bool fail_fast;
    /**
     * The thread which the SimRunner executes on
     */
@@ -119,6 +134,10 @@ class SimRunner {
     * The condition is notified every time a log has been added to the queue
     */
    std::condition_variable &log_export_queue_cdn;
+    /**
+     * If fail_fast is true, error details will be stored here on failure so an exception can be thrown from the main thread
+     */
+    ErrorDetail &fast_err_detail;
 };

 }  // namespace flamegpu

diff --git a/src/flamegpu/gpu/CUDAEnsemble.cu b/src/flamegpu/gpu/CUDAEnsemble.cu
index a90f900de..4dbbeee2c 100644
--- a/src/flamegpu/gpu/CUDAEnsemble.cu
+++ b/src/flamegpu/gpu/CUDAEnsemble.cu
@@ -35,7 +35,7 @@ CUDAEnsemble::~CUDAEnsemble() {



-void CUDAEnsemble::simulate(const RunPlanVector &plans) {
+unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     // Validate that RunPlan model matches CUDAEnsemble model
     if (*plans.environment != this->model->environment->properties) {
         THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()");
@@ -111,6 +111,7 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
     std::queue<unsigned int> log_export_queue;
     std::mutex log_export_queue_mutex;
     std::condition_variable log_export_queue_cdn;
+    SimRunner::ErrorDetail fast_err_detail = {};

     // Init with placement new
     {
@@ -121,7 +122,11 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
         unsigned int i = 0;
         for (auto &d : devices) {
             for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
-                new (&runners[i++]) SimRunner(model, err_ct, next_run, plans, step_log_config, exit_log_config, d, j, !config.quiet, run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn);
+                new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
+                    step_log_config, exit_log_config,
+                    d, j,
+                    !config.quiet, config.error_level == EnsembleConfig::Fast,
+                    run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail);
             }
         }
     }
@@ -174,6 +179,15 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {

     // Free memory
     free(runners);
+
+    if (config.error_level == EnsembleConfig::Fast && err_ct.load()) {
+        THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n",
+            fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str());
+    } else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) {
+        THROW exception::EnsembleError("%u/%u runs failed!\n", err_ct.load(), static_cast<unsigned int>(plans.size()));
+    }
+
+    return err_ct.load();
 }

 void CUDAEnsemble::initialise(int argc, const char** argv) {
@@ -273,6 +287,27 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) {
             config.timing = true;
             continue;
         }
+        // -e/--error, Specify the error level
+        if (arg.compare("--error") == 0 || arg.compare("-e") == 0) {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "%s requires a trailing argument\n", arg.c_str());
+                return false;
+            }
+            std::string error_level_string = argv[++i];
+            // Shift the trailing arg to lower case
+            std::transform(error_level_string.begin(), error_level_string.end(), error_level_string.begin(), [](unsigned char c) { return std::use_facet<std::ctype<char>>(std::locale()).tolower(c); });
+            if (error_level_string.compare("off") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Off)) == 0) {
+                config.error_level = EnsembleConfig::Off;
+            } else if (error_level_string.compare("slow") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Slow)) == 0) {
+                config.error_level = EnsembleConfig::Slow;
+            } else if (error_level_string.compare("fast") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Fast)) == 0) {
+                config.error_level = EnsembleConfig::Fast;
+            } else {
+                fprintf(stderr, "%s is not an appropriate argument for %s\n", error_level_string.c_str(), arg.c_str());
+                return false;
+            }
+            continue;
+        }
         fprintf(stderr, "Unexpected argument: %s\n", arg.c_str());
         printHelp(argv[0]);
         return false;
@@ -292,6 +327,7 @@ void CUDAEnsemble::printHelp(const char *executable) {
     printf(line_fmt, "-o, --out <directory> <filetype>", "Directory and filetype for ensemble outputs");
     printf(line_fmt, "-q, --quiet", "Don't print progress information to console");
     printf(line_fmt, "-t, --timing", "Output timing information to stdout");
+    printf(line_fmt, "-e, --error <error level>", "The error level 0, 1, 2, off, slow or fast");
 }
 void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) {
     // Validate ModelDescription matches

diff --git a/src/flamegpu/sim/SimRunner.cu b/src/flamegpu/sim/SimRunner.cu
index f4633bd6c..c5982680f 100644
--- a/src/flamegpu/sim/SimRunner.cu
+++ b/src/flamegpu/sim/SimRunner.cu
@@ -23,15 +23,18 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
     int _device_id,
     unsigned int _runner_id,
     bool _verbose,
+    bool _fail_fast,
     std::vector<RunLog> &_run_logs,
     std::queue<unsigned int> &_log_export_queue,
     std::mutex &_log_export_queue_mutex,
-    std::condition_variable &_log_export_queue_cdn)
+    std::condition_variable &_log_export_queue_cdn,
+    ErrorDetail &_fast_err_detail)
     : model(_model->clone())
     , run_id(0)
     , device_id(_device_id)
     , runner_id(_runner_id)
     , verbose(_verbose)
+    , fail_fast(_fail_fast)
     , err_ct(_err_ct)
     , next_run(_next_run)
     , plans(_plans)
@@ -40,7 +43,8 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
     , run_logs(_run_logs)
     , log_export_queue(_log_export_queue)
     , log_export_queue_mutex(_log_export_queue_mutex)
-    , log_export_queue_cdn(_log_export_queue_cdn) {
+    , log_export_queue_cdn(_log_export_queue_cdn)
+    , fast_err_detail(_fast_err_detail) {
     this->thread = std::thread(&SimRunner::start, this);
     // Attempt to name the thread
 #ifdef _MSC_VER
@@ -103,7 +107,27 @@ void SimRunner::start() {
                     fflush(stdout);
                 }
             } catch(std::exception &e) {
-                fprintf(stderr, "\nRun %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
+                ++err_ct;
+                if (this->fail_fast) {
+                    // Kill the other workers early
+                    next_run += static_cast<unsigned int>(plans.size());
+                    {
+                        std::lock_guard<std::mutex> lck(log_export_queue_mutex);
+                        log_export_queue.push(UINT_MAX);
+                        // log_export_queue_mutex is treated as our protection for race conditions on fast_err_detail
+                        fast_err_detail.run_id = run_id;
+                        fast_err_detail.device_id = device_id;
+                        fast_err_detail.runner_id = runner_id;
+                        fast_err_detail.exception_string = e.what();
+                    }
+                    return;
+                } else {
+                    if (verbose) {
+                        fprintf(stdout, "\n");
+                        fflush(stdout);
+                    }
+                    fprintf(stderr, "Run %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
+                }
             }
         }
     }

diff --git a/tests/test_cases/gpu/test_cuda_ensemble.cu b/tests/test_cases/gpu/test_cuda_ensemble.cu
index 3c9d74db5..10a275e23 100644
--- a/tests/test_cases/gpu/test_cuda_ensemble.cu
+++ b/tests/test_cases/gpu/test_cuda_ensemble.cu
@@ -349,6 +349,112 @@ TEST(TestCUDAEnsemble, getEnsembleElapsedTime) {
     double threshold = sleepDurationSeconds * 0.8;
     EXPECT_GE(elapsedSeconds, threshold);
 }
+unsigned int tracked_err_ct;
+unsigned int tracked_runs_ct;
+FLAMEGPU_STEP_FUNCTION(throwException) {
+    ++tracked_runs_ct;
+    static int i = 0;
+    if (++i % 2 == 0) {
+        ++tracked_err_ct;
+        THROW exception::UnknownInternalError("Dummy Exception");
+    }
+}
+TEST(TestCUDAEnsemble, ErrorOff) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Off;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = {0};
+    unsigned int reported_err_ct = 0;
+    // Simulate the ensemble,
+    EXPECT_NO_THROW(reported_err_ct = ensemble.simulate(plans));
+    // Check correct number of fails is reported
+    EXPECT_EQ(reported_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_runs_ct, ENSEMBLE_COUNT);
+}
+TEST(TestCUDAEnsemble, ErrorSlow) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Slow;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = { 0 };
+    // Simulate the ensemble,
+    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
+    // Check correct number of fails occurred (unable to retrieve the actual error count, except from stderr, with Slow)
+    EXPECT_EQ(tracked_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_runs_ct, ENSEMBLE_COUNT);
+}
+TEST(TestCUDAEnsemble, ErrorFast) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Fast;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = { 0 };
+    // Simulate the ensemble,
+    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
+    // Check correct number of fails occurred (Fast kills the ensemble as soon as the first error occurs)
+    EXPECT_EQ(tracked_err_ct, 1u);
+    // The first run does not throw
+    EXPECT_EQ(tracked_runs_ct, 2u);
+}
 }  // namespace test_cuda_ensemble
 }  // namespace tests
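--
Reviewer note (not part of the patch): a minimal sketch of how the new
error_level config and the simulate() return value are intended to be
consumed downstream. `model` and `plans` stand in for any valid
ModelDescription/RunPlanVector pair, constructed as in the tests above;
the umbrella include and the helper name `runEnsemble` are illustrative
assumptions, not part of this change.

    #include <cstdio>
    #include "flamegpu/flamegpu.h"

    // Returns 0 if every run in the ensemble succeeded.
    int runEnsemble(flamegpu::ModelDescription &model, flamegpu::RunPlanVector &plans) {
        flamegpu::CUDAEnsemble ensemble(model);

        // Off: simulate() no longer throws for failed runs, so the caller
        // must check the returned failure count.
        ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Off;
        const unsigned int failures = ensemble.simulate(plans);
        if (failures)
            fprintf(stderr, "%u runs failed\n", failures);

        // Fast: the first failure cancels the remaining runs and is rethrown
        // from the main thread as exception::EnsembleError. (Slow behaves the
        // same, but only after all runs have been attempted.)
        ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Fast;
        try {
            ensemble.simulate(plans);
        } catch (const flamegpu::exception::EnsembleError &e) {
            fprintf(stderr, "%s\n", e.what());  // Details of the first failed run
            return 1;
        }
        return failures ? 1 : 0;
    }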