Skip to content

Commit

Permalink
Improve CUDAEnsemble's error reporting.
Browse files Browse the repository at this point in the history
With 3 additional tests verifying that all 3 levels of error reporting work.
  • Loading branch information
Robadob committed May 17, 2022
1 parent e3fe3e5 commit bd7b2db
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 10 deletions.
4 changes: 4 additions & 0 deletions include/flamegpu/exception/FLAMEGPUException.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,10 @@ DERIVED_FLAMEGPUException(InvalidDependencyGraph, "Agent function dependency gra
* This should not occur if the shared ID matches ID_NOT_SET
*/
DERIVED_FLAMEGPUException(AgentIDCollision, "Multiple agents of same type share an ID");
/**
 * Defines an error raised when one or more runs fail during an ensemble's execution
 * Thrown by CUDAEnsemble::simulate() according to EnsembleConfig::error_level
 */
DERIVED_FLAMEGPUException(EnsembleError, "One or more runs failed during the ensemble's execution");

} // namespace exception
} // namespace flamegpu
Expand Down
10 changes: 9 additions & 1 deletion include/flamegpu/gpu/CUDAEnsemble.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ class CUDAEnsemble {
* This is independent of the EnsembleConfig::quiet
*/
bool timing = false;
enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };
/**
* Off: Runs which fail do not cause an exception to be raised. Failed runs must be probed manually via checking the return value of calls to CUDAEnsemble::simulate()
* Slow: If any runs fail, an EnsembleException will be raised after all runs have been attempted, before CUDAEnsemble::simulate() returns.
* Fast: An EnsembleException will be raised as soon as a failed run is detected, cancelling remaining runs.
*/
ErrorLevel error_level = Slow;
};
/**
* Initialise CUDA Ensemble
Expand All @@ -72,8 +79,9 @@ class CUDAEnsemble {
* Execute the ensemble of simulations.
* This call will block until all simulations have completed or MAX_ERRORS simulations exit with an error
* @param plan The plan of individual runs to execute during the ensemble
* @return 0 on success, otherwise the number of runs which reported errors and failed
*/
void simulate(const RunPlanVector &plan);
unsigned int simulate(const RunPlanVector &plan);

/**
* @return A mutable reference to the ensemble configuration struct
Expand Down
2 changes: 1 addition & 1 deletion include/flamegpu/model/EnvironmentDescription.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class EnvironmentDescription {
friend class CUDASimulation;

friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
Expand Down
2 changes: 1 addition & 1 deletion include/flamegpu/sim/LoggingConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class LoggingConfig {
/**
* Requires access to log_timing
*/
friend void CUDAEnsemble::simulate(const RunPlanVector& plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);

public:
/**
Expand Down
2 changes: 1 addition & 1 deletion include/flamegpu/sim/RunPlanVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class EnvironmentDescription;
class RunPlanVector : private std::vector<RunPlan> {
friend class RunPlan;
friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
Expand Down
21 changes: 20 additions & 1 deletion include/flamegpu/sim/SimRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <condition_variable>
#include <thread>
#include <vector>
#include <string>
#include "flamegpu/sim/LogFrame.h"

namespace flamegpu {
Expand All @@ -25,6 +26,12 @@ class RunPlanVector;
*/
class SimRunner {
friend class CUDAEnsemble;
    /**
     * Details of a failed run, filled in by the worker thread so the main thread
     * can compose and rethrow the error (see fast_err_detail)
     */
    struct ErrorDetail {
        // Index (within the RunPlanVector) of the run that failed
        unsigned int run_id;
        // CUDA device the failed run executed on
        unsigned int device_id;
        // Index of the SimRunner thread that executed the failed run
        unsigned int runner_id;
        // Message of the exception raised by the failed run (captured from e.what())
        std::string exception_string;
    };
/**
* Constructor, creates and initialise a new SimRunner
* @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances.
Expand All @@ -36,10 +43,12 @@ class SimRunner {
* @param _device_id The GPU that all runs should execute on
* @param _runner_id A unique index assigned to the runner
* @param _verbose If true more information will be written to stdout
* @param _fail_fast If true, the SimRunner will kill other runners and throw an exception on error
* @param run_logs Reference to the vector to store generate run logs
* @param log_export_queue The queue of logs to exported to disk
* @param log_export_queue_mutex This mutex must be locked to access log_export_queue
* @param log_export_queue_cdn The condition is notified every time a log has been added to the queue
* @param fast_err_detail Structure to store error details on fast failure for main thread rethrow
*/
SimRunner(const std::shared_ptr<const ModelData> _model,
std::atomic<unsigned int> &_err_ct,
Expand All @@ -50,10 +59,12 @@ class SimRunner {
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &run_logs,
std::queue<unsigned int> &log_export_queue,
std::mutex &log_export_queue_mutex,
std::condition_variable &log_export_queue_cdn);
std::condition_variable &log_export_queue_cdn,
ErrorDetail &fast_err_detail);
/**
* Each sim runner takes it's own clone of model description hierarchy, so it can manipulate environment without conflict
*/
Expand All @@ -74,6 +85,10 @@ class SimRunner {
* Flag for whether to print progress
*/
const bool verbose;
/**
* Flag for whether the ensemble should throw an exception if it errors out
*/
const bool fail_fast;
/**
* The thread which the SimRunner executes on
*/
Expand Down Expand Up @@ -119,6 +134,10 @@ class SimRunner {
* The condition is notified every time a log has been added to the queue
*/
std::condition_variable &log_export_queue_cdn;
/**
* If fail_fast is true, on error details will be stored here so an exception can be thrown from the main thread
*/
ErrorDetail& fast_err_detail;
};

} // namespace flamegpu
Expand Down
40 changes: 38 additions & 2 deletions src/flamegpu/gpu/CUDAEnsemble.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ CUDAEnsemble::~CUDAEnsemble() {



void CUDAEnsemble::simulate(const RunPlanVector &plans) {
unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
// Validate that RunPlan model matches CUDAEnsemble model
if (*plans.environment != this->model->environment->properties) {
THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()");
Expand Down Expand Up @@ -111,6 +111,7 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
std::queue<unsigned int> log_export_queue;
std::mutex log_export_queue_mutex;
std::condition_variable log_export_queue_cdn;
SimRunner::ErrorDetail fast_err_detail = {};

// Init with placement new
{
Expand All @@ -121,7 +122,11 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
unsigned int i = 0;
for (auto &d : devices) {
for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans, step_log_config, exit_log_config, d, j, !config.quiet, run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn);
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
step_log_config, exit_log_config,
d, j,
!config.quiet, config.error_level == EnsembleConfig::Fast,
run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail);
}
}
}
Expand Down Expand Up @@ -174,6 +179,15 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {

// Free memory
free(runners);

if (config.error_level == EnsembleConfig::Fast && err_ct.load()) {
THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n",
fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str());
} else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) {
THROW exception::EnsembleError("%u/%u runs failed!\n.", err_ct.load(), static_cast<unsigned int>(plans.size()));
}

return err_ct.load();
}

void CUDAEnsemble::initialise(int argc, const char** argv) {
Expand Down Expand Up @@ -273,6 +287,27 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) {
config.timing = true;
continue;
}
// -e/--error, Specify the error level
if (arg.compare("--error") == 0 || arg.compare("-e") == 0) {
if (i + 1 >= argc) {
fprintf(stderr, "%s requires a trailing argument\n", arg.c_str());
return false;
}
std::string error_level_string = argv[++i];
// Shift the trailing arg to lower
std::transform(error_level_string.begin(), error_level_string.end(), error_level_string.begin(), [](unsigned char c) { return std::use_facet< std::ctype<char>>(std::locale()).tolower(c); });
if (arg.compare("off") == 0 || arg.compare(std::to_string(EnsembleConfig::Off)) == 0) {
config.error_level = EnsembleConfig::Off;
} else if (arg.compare("slow") == 0 || arg.compare(std::to_string(EnsembleConfig::Slow)) == 0) {
config.error_level = EnsembleConfig::Slow;
} else if (arg.compare("fast") == 0 || arg.compare(std::to_string(EnsembleConfig::Fast)) == 0) {
config.error_level = EnsembleConfig::Fast;
} else {
fprintf(stderr, "%s is not an appropriate argument for %s\n", error_level_string.c_str(), arg.c_str());
return false;
}
continue;
}
fprintf(stderr, "Unexpected argument: %s\n", arg.c_str());
printHelp(argv[0]);
return false;
Expand All @@ -292,6 +327,7 @@ void CUDAEnsemble::printHelp(const char *executable) {
printf(line_fmt, "-o, --out <directory> <filetype>", "Directory and filetype for ensemble outputs");
printf(line_fmt, "-q, --quiet", "Don't print progress information to console");
printf(line_fmt, "-t, --timing", "Output timing information to stdout");
printf(line_fmt, "-e, --error <error level>", "The error level 0, 1, 2, off, slow or fast");
}
void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) {
// Validate ModelDescription matches
Expand Down
30 changes: 27 additions & 3 deletions src/flamegpu/sim/SimRunner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,18 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &_run_logs,
std::queue<unsigned int> &_log_export_queue,
std::mutex &_log_export_queue_mutex,
std::condition_variable &_log_export_queue_cdn)
std::condition_variable &_log_export_queue_cdn,
ErrorDetail &_fast_err_detail)
: model(_model->clone())
, run_id(0)
, device_id(_device_id)
, runner_id(_runner_id)
, verbose(_verbose)
, fail_fast(_fail_fast)
, err_ct(_err_ct)
, next_run(_next_run)
, plans(_plans)
Expand All @@ -40,7 +43,8 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
, run_logs(_run_logs)
, log_export_queue(_log_export_queue)
, log_export_queue_mutex(_log_export_queue_mutex)
, log_export_queue_cdn(_log_export_queue_cdn) {
, log_export_queue_cdn(_log_export_queue_cdn)
, fast_err_detail(_fast_err_detail) {
this->thread = std::thread(&SimRunner::start, this);
// Attempt to name the thread
#ifdef _MSC_VER
Expand Down Expand Up @@ -103,7 +107,27 @@ void SimRunner::start() {
fflush(stdout);
}
} catch(std::exception &e) {
fprintf(stderr, "\nRun %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
++err_ct;
if (this->fail_fast) {
// Kill the other workers early
next_run += static_cast<unsigned int>(plans.size());
{
std::lock_guard<std::mutex> lck(log_export_queue_mutex);
log_export_queue.push(UINT_MAX);
// log_export_mutex is treated as our protection for race conditions on fast_err_detail
fast_err_detail.run_id = run_id;
fast_err_detail.device_id = device_id;
fast_err_detail.runner_id = runner_id;
fast_err_detail.exception_string = e.what();
}
return;
} else {
if (verbose) {
fprintf(stdout, "\n");
fflush(stdout);
}
fprintf(stderr, "Run %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
}
}
}
}
Expand Down
106 changes: 106 additions & 0 deletions tests/test_cases/gpu/test_cuda_ensemble.cu
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,112 @@ TEST(TestCUDAEnsemble, getEnsembleElapsedTime) {
double threshold = sleepDurationSeconds * 0.8;
EXPECT_GE(elapsedSeconds, threshold);
}
// Counters incremented by the step function below, so tests can verify how many
// runs were attempted and how many of them raised an exception.
unsigned int tracked_err_ct;
unsigned int tracked_runs_ct;
// Step function which throws on every second invocation, to produce failing runs.
FLAMEGPU_STEP_FUNCTION(throwException) {
    ++tracked_runs_ct;
    // NOTE(review): `i` is static, so it persists across all tests in this binary;
    // each test below performs an even number of runs, which keeps the failure
    // pattern aligned (every 2nd run throws) — confirm if tests are added/reordered.
    static int i = 0;
    if (++i % 2 == 0) {
        ++tracked_err_ct;
        THROW exception::UnknownInternalError("Dummy Exception");
    }
}
// error_level == Off: failing runs never raise an exception; instead
// CUDAEnsemble::simulate() returns the number of runs which failed.
TEST(TestCUDAEnsemble, ErrorOff) {
    tracked_err_ct = 0;
    tracked_runs_ct = 0;
    // Minimal model: one agent type, an init function, and a step function which
    // fails on every 2nd run.
    flamegpu::ModelDescription model("test");
    // Environmental constant for initial population
    model.Environment().newProperty<uint32_t>("POPULATION_TO_GENERATE", 1, true);
    flamegpu::AgentDescription& agent = model.newAgent("Agent");
    agent.newVariable<uint32_t>("counter", 0);
    model.addInitFunction(elapsedInit);
    model.addStepFunction(throwException);
    // Build 10 single-step run plans
    const unsigned int kEnsembleCount = 10;
    auto run_plans = flamegpu::RunPlanVector(model, kEnsembleCount);
    run_plans.setSteps(1);
    // Configure a quiet, serialised ensemble at error level Off
    flamegpu::CUDAEnsemble cuda_ensemble(model);
    cuda_ensemble.Config().out_format = "";  // Suppress warning
    cuda_ensemble.Config().quiet = true;  // Avoid outputting during the test suite
    cuda_ensemble.Config().devices = { 0 };
    cuda_ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
    cuda_ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Off;
    // Simulate; no exception is expected at this error level
    unsigned int returned_err_ct = 0;
    EXPECT_NO_THROW(returned_err_ct = cuda_ensemble.simulate(run_plans));
    // Half the runs should have failed, but every run should have been attempted
    EXPECT_EQ(returned_err_ct, kEnsembleCount / 2);
    EXPECT_EQ(tracked_err_ct, kEnsembleCount / 2);
    EXPECT_EQ(tracked_runs_ct, kEnsembleCount);
}
// error_level == Slow: an exception::EnsembleError is raised only after all
// runs have been attempted, so the failure count is still fully accumulated.
TEST(TestCUDAEnsemble, ErrorSlow) {
    tracked_err_ct = 0;
    tracked_runs_ct = 0;
    // Create a model containing atleast one agent type and function.
    flamegpu::ModelDescription model("test");
    // Environmental constant for initial population
    model.Environment().newProperty<uint32_t>("POPULATION_TO_GENERATE", 1, true);
    // Agent(s)
    flamegpu::AgentDescription& agent = model.newAgent("Agent");
    agent.newVariable<uint32_t>("counter", 0);
    // Control flow: throwException fails every 2nd run
    model.addInitFunction(elapsedInit);
    model.addStepFunction(throwException);
    // Create a set of 10 Run plans
    const unsigned int ENSEMBLE_COUNT = 10;
    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
    plans.setSteps(1);
    // Create an ensemble
    flamegpu::CUDAEnsemble ensemble(model);
    // Make it quiet to avoid outputting during the test suite
    ensemble.Config().quiet = true;
    ensemble.Config().out_format = "";  // Suppress warning
    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Slow;
    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
    ensemble.Config().devices = { 0 };
    // Simulate the ensemble, the accumulated failures surface as one exception at the end
    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
    // Check correct number of fails occurred (Unable to retrieve actual error count except from stderr with SLOW)
    EXPECT_EQ(tracked_err_ct, ENSEMBLE_COUNT / 2);
    // All runs were still attempted despite the failures
    EXPECT_EQ(tracked_runs_ct, ENSEMBLE_COUNT);
}
// error_level == Fast: an exception::EnsembleError is raised as soon as a
// failed run is detected, and the remaining runs are cancelled.
TEST(TestCUDAEnsemble, ErrorFast) {
    tracked_err_ct = 0;
    tracked_runs_ct = 0;
    // Create a model containing atleast one agent type and function.
    flamegpu::ModelDescription model("test");
    // Environmental constant for initial population
    model.Environment().newProperty<uint32_t>("POPULATION_TO_GENERATE", 1, true);
    // Agent(s)
    flamegpu::AgentDescription& agent = model.newAgent("Agent");
    agent.newVariable<uint32_t>("counter", 0);
    // Control flow: throwException fails every 2nd run
    model.addInitFunction(elapsedInit);
    model.addStepFunction(throwException);
    // Create a set of 10 Run plans
    const unsigned int ENSEMBLE_COUNT = 10;
    auto plans = flamegpu::RunPlanVector(model, ENSEMBLE_COUNT);
    plans.setSteps(1);
    // Create an ensemble
    flamegpu::CUDAEnsemble ensemble(model);
    // Make it quiet to avoid outputting during the test suite
    ensemble.Config().quiet = true;
    ensemble.Config().out_format = "";  // Suppress warning
    ensemble.Config().error_level = CUDAEnsemble::EnsembleConfig::Fast;
    ensemble.Config().concurrent_runs = 1;  // Single device/no concurrency to ensure we get consistent data
    ensemble.Config().devices = { 0 };
    // Simulate the ensemble,
    EXPECT_THROW(ensemble.simulate(plans), exception::EnsembleError);
    // Check correct number of fails occurred (Fast kills ensemble as soon as first error occurs)
    EXPECT_EQ(tracked_err_ct, 1u);
    // The first run does not throw
    EXPECT_EQ(tracked_runs_ct, 2u);
}

} // namespace test_cuda_ensemble
} // namespace tests
Expand Down

0 comments on commit bd7b2db

Please sign in to comment.