From bd7b2db0a947ee9451132eb7ab8134302a5be616 Mon Sep 17 00:00:00 2001
From: Robert Chisholm
Date: Wed, 27 Apr 2022 14:10:51 +0100
Subject: [PATCH] Improve CUDAEnsemble's error reporting.

With 3 additional tests verifying that the 3 levels of reporting all work.
---
 .../flamegpu/exception/FLAMEGPUException.h  |   4 +
 include/flamegpu/gpu/CUDAEnsemble.h         |  10 +-
 .../flamegpu/model/EnvironmentDescription.h |   2 +-
 include/flamegpu/sim/LoggingConfig.h        |   2 +-
 include/flamegpu/sim/RunPlanVector.h        |   2 +-
 include/flamegpu/sim/SimRunner.h            |  21 +++-
 src/flamegpu/gpu/CUDAEnsemble.cu            |  40 ++++++-
 src/flamegpu/sim/SimRunner.cu               |  30 ++++-
 tests/test_cases/gpu/test_cuda_ensemble.cu  | 106 ++++++++++++++++++
 9 files changed, 207 insertions(+), 10 deletions(-)

diff --git a/include/flamegpu/exception/FLAMEGPUException.h b/include/flamegpu/exception/FLAMEGPUException.h
index d9c371cc8..e31396278 100644
--- a/include/flamegpu/exception/FLAMEGPUException.h
+++ b/include/flamegpu/exception/FLAMEGPUException.h
@@ -411,6 +411,10 @@ DERIVED_FLAMEGPUException(InvalidDependencyGraph, "Agent function dependency gra
  * This should not occur if the shared ID matches ID_NOT_SET
  */
 DERIVED_FLAMEGPUException(AgentIDCollision, "Multiple agents of same type share an ID");
+/**
+ * Defines an error when runs fail during an ensemble's execution
+ */
+DERIVED_FLAMEGPUException(EnsembleError, "One or more runs failed during the ensemble's execution");

 }  // namespace exception
 }  // namespace flamegpu

diff --git a/include/flamegpu/gpu/CUDAEnsemble.h b/include/flamegpu/gpu/CUDAEnsemble.h
index faf103dec..c97994d5b 100644
--- a/include/flamegpu/gpu/CUDAEnsemble.h
+++ b/include/flamegpu/gpu/CUDAEnsemble.h
@@ -53,6 +53,13 @@ class CUDAEnsemble {
          * This is independent of the EnsembleConfig::quiet
          */
         bool timing = false;
+        enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };
+        /**
+         * Off: Runs which fail do not cause an exception to be raised. Failed runs must be probed manually, by checking the return value of CUDAEnsemble::simulate()
+         * Slow: If any runs fail, an exception::EnsembleError will be raised after all runs have been attempted, before CUDAEnsemble::simulate() returns.
+         * Fast: An exception::EnsembleError will be raised as soon as a failed run is detected, cancelling remaining runs.
+         */
+        ErrorLevel error_level = Slow;
     };
     /**
     * Initialise CUDA Ensemble
@@ -72,8 +79,9 @@ class CUDAEnsemble {
     * Execute the ensemble of simulations.
    * This call will block until all simulations have completed or MAX_ERRORS simulations exit with an error
    * @param plan The plan of individual runs to execute during the ensemble
+   * @return 0 on success, otherwise the number of runs which reported errors and failed
    */
-    void simulate(const RunPlanVector &plan);
+    unsigned int simulate(const RunPlanVector &plan);
    /**
    * @return A mutable reference to the ensemble configuration struct

diff --git a/include/flamegpu/model/EnvironmentDescription.h b/include/flamegpu/model/EnvironmentDescription.h
index 686c32ff8..b9fe93388 100644
--- a/include/flamegpu/model/EnvironmentDescription.h
+++ b/include/flamegpu/model/EnvironmentDescription.h
@@ -42,7 +42,7 @@ class EnvironmentDescription {
     friend class CUDASimulation;
     friend class SimRunner;
-    friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

  public:
     /**

diff --git a/include/flamegpu/sim/LoggingConfig.h b/include/flamegpu/sim/LoggingConfig.h
index 2bd254f7f..c19f652b8 100644
--- a/include/flamegpu/sim/LoggingConfig.h
+++ b/include/flamegpu/sim/LoggingConfig.h
@@ -38,7 +38,7 @@ class LoggingConfig {
     /**
      * Requires access to log_timing
      */
-    friend void CUDAEnsemble::simulate(const RunPlanVector& plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);

  public:
     /**

diff --git a/include/flamegpu/sim/RunPlanVector.h b/include/flamegpu/sim/RunPlanVector.h
index 3df0e95d6..b25cecbef 100644
--- a/include/flamegpu/sim/RunPlanVector.h
+++ b/include/flamegpu/sim/RunPlanVector.h
@@ -25,7 +25,7 @@ class EnvironmentDescription;
 class RunPlanVector : private std::vector<RunPlan> {
     friend class RunPlan;
     friend class SimRunner;
-    friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
+    friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

  public:
     /**

diff --git a/include/flamegpu/sim/SimRunner.h b/include/flamegpu/sim/SimRunner.h
index 97496055f..19f47f08b 100644
--- a/include/flamegpu/sim/SimRunner.h
+++ b/include/flamegpu/sim/SimRunner.h
@@ -8,6 +8,7 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#include <string>

 #include "flamegpu/sim/LogFrame.h"

 namespace flamegpu {
@@ -25,6 +26,12 @@ class RunPlanVector;
  */
 class SimRunner {
    friend class CUDAEnsemble;
+    struct ErrorDetail {
+        unsigned int run_id;
+        unsigned int device_id;
+        unsigned int runner_id;
+        std::string exception_string;
+    };
    /**
     * Constructor, creates and initialises a new SimRunner
     * @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances.
@@ -36,10 +43,12 @@ class SimRunner {
     * @param _device_id The GPU that all runs should execute on
     * @param _runner_id A unique index assigned to the runner
     * @param _verbose If true more information will be written to stdout
+    * @param _fail_fast If true, the SimRunner will kill other runners and throw an exception on error
     * @param run_logs Reference to the vector to store generated run logs
     * @param log_export_queue The queue of logs to be exported to disk
     * @param log_export_queue_mutex This mutex must be locked to access log_export_queue
     * @param log_export_queue_cdn The condition is notified every time a log has been added to the queue
+    * @param fast_err_detail Structure to store error details on fast failure for main thread rethrow
     */
    SimRunner(const std::shared_ptr<const ModelData> _model,
        std::atomic<unsigned int> &_err_ct,
@@ -50,10 +59,12 @@ class SimRunner {
        int _device_id,
        unsigned int _runner_id,
        bool _verbose,
+        bool _fail_fast,
        std::vector<RunLog> &run_logs,
        std::queue<unsigned int> &log_export_queue,
        std::mutex &log_export_queue_mutex,
-        std::condition_variable &log_export_queue_cdn);
+        std::condition_variable &log_export_queue_cdn,
+        ErrorDetail &fast_err_detail);
    /**
     * Each sim runner takes its own clone of model description hierarchy, so it can manipulate environment without conflict
     */
@@ -74,6 +85,10 @@ class SimRunner {
     * Flag for whether to print progress
     */
    const bool verbose;
+    /**
+     * Flag for whether the ensemble should fail fast, throwing an exception as soon as a run fails
+     */
+    const bool fail_fast;
    /**
     * The thread which the SimRunner executes on
     */
@@ -119,6 +134,10 @@ class SimRunner {
     * The condition is notified every time a log has been added to the queue
     */
    std::condition_variable &log_export_queue_cdn;
+    /**
+     * If fail_fast is true, error details will be stored here on failure so an exception can be thrown from the main thread
+     */
+    ErrorDetail &fast_err_detail;
 };

 }  // namespace flamegpu

diff --git a/src/flamegpu/gpu/CUDAEnsemble.cu b/src/flamegpu/gpu/CUDAEnsemble.cu
index a90f900de..4dbbeee2c 100644
--- a/src/flamegpu/gpu/CUDAEnsemble.cu
+++ b/src/flamegpu/gpu/CUDAEnsemble.cu
@@ -35,7 +35,7 @@ CUDAEnsemble::~CUDAEnsemble() {



-void CUDAEnsemble::simulate(const RunPlanVector &plans) {
+unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
     // Validate that RunPlan model matches CUDAEnsemble model
     if (*plans.environment != this->model->environment->properties) {
         THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()");
@@ -111,6 +111,7 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
     std::queue<unsigned int> log_export_queue;
     std::mutex log_export_queue_mutex;
     std::condition_variable log_export_queue_cdn;
+    SimRunner::ErrorDetail fast_err_detail = {};

     // Init with placement new
     {
@@ -121,7 +122,11 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
         unsigned int i = 0;
         for (auto &d : devices) {
             for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
-                new (&runners[i++]) SimRunner(model, err_ct, next_run, plans, step_log_config, exit_log_config, d, j, !config.quiet, run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn);
+                new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
+                    step_log_config, exit_log_config,
+                    d, j,
+                    !config.quiet, config.error_level == EnsembleConfig::Fast,
+                    run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail);
             }
         }
     }
@@ -174,6 +179,15 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {

     // Free memory
     free(runners);
+
+    if (config.error_level == EnsembleConfig::Fast && err_ct.load()) {
+        THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n",
+            fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str());
+    } else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) {
+        THROW exception::EnsembleError("%u/%u runs failed!\n", err_ct.load(), static_cast<unsigned int>(plans.size()));
+    }
+
+    return err_ct.load();
 }

 void CUDAEnsemble::initialise(int argc, const char** argv) {
@@ -273,6 +287,27 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) {
             config.timing = true;
             continue;
         }
+        // -e/--error, Specify the error level
+        if (arg.compare("--error") == 0 || arg.compare("-e") == 0) {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "%s requires a trailing argument\n", arg.c_str());
+                return false;
+            }
+            std::string error_level_string = argv[++i];
+            // Shift the trailing arg to lower case
+            std::transform(error_level_string.begin(), error_level_string.end(), error_level_string.begin(), [](unsigned char c) { return std::use_facet<std::ctype<char>>(std::locale()).tolower(c); });
+            if (error_level_string.compare("off") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Off)) == 0) {
+                config.error_level = EnsembleConfig::Off;
+            } else if (error_level_string.compare("slow") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Slow)) == 0) {
+                config.error_level = EnsembleConfig::Slow;
+            } else if (error_level_string.compare("fast") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Fast)) == 0) {
+                config.error_level = EnsembleConfig::Fast;
+            } else {
+                fprintf(stderr, "%s is not an appropriate argument for %s\n", error_level_string.c_str(), arg.c_str());
+                return false;
+            }
+            continue;
+        }
         fprintf(stderr, "Unexpected argument: %s\n", arg.c_str());
         printHelp(argv[0]);
         return false;
@@ -292,6 +327,7 @@ void CUDAEnsemble::printHelp(const char *executable) {
     printf(line_fmt, "-o, --out <directory> <filetype>", "Directory and filetype for ensemble outputs");
     printf(line_fmt, "-q, --quiet", "Don't print progress information to console");
     printf(line_fmt, "-t, --timing", "Output timing information to stdout");
+    printf(line_fmt, "-e, --error <error level>", "The error level 0, 1, 2, off, slow or fast");
 }
 void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) {
     // Validate ModelDescription matches

diff --git a/src/flamegpu/sim/SimRunner.cu b/src/flamegpu/sim/SimRunner.cu
index f4633bd6c..c5982680f 100644
--- a/src/flamegpu/sim/SimRunner.cu
+++ b/src/flamegpu/sim/SimRunner.cu
@@ -23,15 +23,18 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
     int _device_id,
     unsigned int _runner_id,
     bool _verbose,
+    bool _fail_fast,
     std::vector<RunLog> &_run_logs,
     std::queue<unsigned int> &_log_export_queue,
     std::mutex &_log_export_queue_mutex,
-    std::condition_variable &_log_export_queue_cdn)
+    std::condition_variable &_log_export_queue_cdn,
+    ErrorDetail &_fast_err_detail)
     : model(_model->clone())
     , run_id(0)
     , device_id(_device_id)
     , runner_id(_runner_id)
     , verbose(_verbose)
+    , fail_fast(_fail_fast)
     , err_ct(_err_ct)
     , next_run(_next_run)
     , plans(_plans)
@@ -40,7 +43,8 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
     , run_logs(_run_logs)
     , log_export_queue(_log_export_queue)
     , log_export_queue_mutex(_log_export_queue_mutex)
-    , log_export_queue_cdn(_log_export_queue_cdn) {
+    , log_export_queue_cdn(_log_export_queue_cdn)
+    , fast_err_detail(_fast_err_detail) {
     this->thread = std::thread(&SimRunner::start, this);
     // Attempt to name the thread
 #ifdef _MSC_VER
@@ -103,7 +107,27 @@ void SimRunner::start() {
                     fflush(stdout);
                 }
             } catch(std::exception &e) {
-                fprintf(stderr, "\nRun %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
+                ++err_ct;
+                if (this->fail_fast) {
+                    // Kill the other workers early
+                    next_run += static_cast<unsigned int>(plans.size());
+                    {
+                        std::lock_guard<std::mutex> lck(log_export_queue_mutex);
+                        log_export_queue.push(UINT_MAX);
+                        // log_export_queue_mutex is treated as our protection for race conditions on fast_err_detail
+                        fast_err_detail.run_id = run_id;
+                        fast_err_detail.device_id = device_id;
+                        fast_err_detail.runner_id = runner_id;
+                        fast_err_detail.exception_string = e.what();
+                    }
+                    return;
+                } else {
+                    if (verbose) {
+                        fprintf(stdout, "\n");
+                        fflush(stdout);
+                    }
+                    fprintf(stderr, "Run %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
+                }
             }
         }
     }

diff --git a/tests/test_cases/gpu/test_cuda_ensemble.cu b/tests/test_cases/gpu/test_cuda_ensemble.cu
index 3c9d74db5..10a275e23 100644
--- a/tests/test_cases/gpu/test_cuda_ensemble.cu
+++ b/tests/test_cases/gpu/test_cuda_ensemble.cu
@@ -349,6 +349,112 @@ TEST(TestCUDAEnsemble, getEnsembleElapsedTime) {
     double threshold = sleepDurationSeconds * 0.8;
     EXPECT_GE(elapsedSeconds, threshold);
 }
+unsigned int tracked_err_ct;
+unsigned int tracked_runs_ct;
+FLAMEGPU_STEP_FUNCTION(throwException) {
+    ++tracked_runs_ct;
+    static int i = 0;
+    if (++i % 2 == 0) {
+        ++tracked_err_ct;
+        THROW exception::UnknownInternalError("Dummy Exception");
+    }
+}
+TEST(TestCUDAEnsemble, ErrorOff) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Off;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = {0};
+    unsigned int reported_err_ct = 0;
+    // Simulate the ensemble,
+    EXPECT_NO_THROW(reported_err_ct = ensemble.simulate(plans));
+    // Check correct number of fails is reported
+    EXPECT_EQ(reported_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_runs_ct, ENSEMBLE_COUNT);
+}
+TEST(TestCUDAEnsemble, ErrorSlow) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Slow;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = { 0 };
+    // Simulate the ensemble,
+    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
+    // Check correct number of fails occurred (unable to retrieve the actual error count, except from stderr, with Slow)
+    EXPECT_EQ(tracked_err_ct, ENSEMBLE_COUNT / 2);
+    EXPECT_EQ(tracked_runs_ct, ENSEMBLE_COUNT);
+}
+TEST(TestCUDAEnsemble, ErrorFast) {
+    tracked_err_ct = 0;
+    tracked_runs_ct = 0;
+    // Create a model containing at least one agent type and function.
+    flamegpu::ModelDescription model("test");
+    // Environmental constant for initial population
+    model.Environment().newProperty<unsigned int>("POPULATION_TO_GENERATE", 1, true);
+    // Agent(s)
+    flamegpu::AgentDescription& agent = model.newAgent("Agent");
+    agent.newVariable<unsigned int>("counter", 0);
+    // Control flow
+    model.addInitFunction(elapsedInit);
+    model.addStepFunction(throwException);
+    // Create a set of 10 Run plans
+    const unsigned int ENSEMBLE_COUNT = 10;
+    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
+    plans.setSteps(1);
+    // Create an ensemble
+    flamegpu::CUDAEnsemble ensemble(model);
+    // Make it quiet to avoid outputting during the test suite
+    ensemble.Config().quiet = true;
+    ensemble.Config().out_format = "";  // Suppress warning
+    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Fast;
+    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
+    ensemble.Config().devices = { 0 };
+    // Simulate the ensemble,
+    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
+    // Check correct number of fails occurred (Fast kills the ensemble as soon as the first error occurs)
+    EXPECT_EQ(tracked_err_ct, 1u);
+    // The first run does not throw
+    EXPECT_EQ(tracked_runs_ct, 2u);
+}
 }  // namespace test_cuda_ensemble
 }  // namespace tests
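--
Reviewer note (not part of the patch): a minimal sketch of how the new
error_level config and the simulate() return value are intended to be
consumed downstream. `model` and `plans` stand in for any valid
ModelDescription/RunPlanVector pair, constructed as in the tests above;
the umbrella include and the helper name `runEnsemble` are illustrative
assumptions, not part of this change.

    #include <cstdio>
    #include "flamegpu/flamegpu.h"

    // Returns 0 if every run in the ensemble succeeded.
    int runEnsemble(flamegpu::ModelDescription &model, flamegpu::RunPlanVector &plans) {
        flamegpu::CUDAEnsemble ensemble(model);

        // Off: simulate() no longer throws for failed runs, so the caller
        // must check the returned failure count.
        ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Off;
        const unsigned int failures = ensemble.simulate(plans);
        if (failures)
            fprintf(stderr, "%u runs failed\n", failures);

        // Fast: the first failure cancels the remaining runs and is rethrown
        // from the main thread as exception::EnsembleError. (Slow behaves the
        // same, but only after all runs have been attempted.)
        ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Fast;
        try {
            ensemble.simulate(plans);
        } catch (const flamegpu::exception::EnsembleError &e) {
            fprintf(stderr, "%s\n", e.what());  // Details of the first failed run
            return 1;
        }
        return failures ? 1 : 0;
    }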