Improve CUDAEnsemble's error reporting. #839

Merged · 1 commit · May 18, 2022

Changes from all commits
4 changes: 4 additions & 0 deletions include/flamegpu/exception/FLAMEGPUException.h
@@ -411,6 +411,10 @@ DERIVED_FLAMEGPUException(InvalidDependencyGraph, "Agent function dependency gra
* This should not occur if the shared ID matches ID_NOT_SET
*/
DERIVED_FLAMEGPUException(AgentIDCollision, "Multiple agents of same type share an ID");
/**
* Defines an error when runs fail during an ensemble's execution
*/
DERIVED_FLAMEGPUException(EnsembleError, "One or more runs failed during the ensemble's execution");

} // namespace exception
} // namespace flamegpu
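As a hedged illustration of what the new exception type enables downstream: a minimal sketch of catching it around an ensemble run. The `runEnsemble` wrapper, the umbrella include, and the assumption that `model` and `plans` are built elsewhere are illustrative, not part of this PR.

```cpp
#include <cstdio>

#include "flamegpu/flamegpu.h"

// Hypothetical wrapper: `model` and `plans` are assumed to be configured elsewhere.
int runEnsemble(flamegpu::ModelDescription &model, const flamegpu::RunPlanVector &plans) {
    flamegpu::CUDAEnsemble ensemble(model);
    try {
        ensemble.simulate(plans);
    } catch (const flamegpu::exception::EnsembleError &e) {
        // Raised at the Slow/Fast error levels when one or more runs fail
        std::fprintf(stderr, "Ensemble failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```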
21 changes: 18 additions & 3 deletions include/flamegpu/gpu/CUDAEnsemble.h
@@ -25,34 +25,47 @@ class CUDAEnsemble {
* Execution config for running a CUDAEnsemble
*/
struct EnsembleConfig {
// std::string in = "";
/**
* Directory to store output data (primarily logs)
* Defaults to "" (the working directory, no subdirectory)
*/
std::string out_directory = "";
/**
* Output format
* This must be a supported format, e.g. "json" or "xml"
* Defaults to "json"
*/
std::string out_format = "json";
/**
* The maximum number of concurrent runs
* Defaults to 4
*/
unsigned int concurrent_runs = 4;
/**
* The CUDA device ids of devices to be used
* If this is left empty, all available devices will be used
* Defaults to empty set (all available devices)
*/
std::set<int> devices;
/**
* If true progress logging to stdout will be suppressed
* Defaults to false
*/
bool quiet = false;
/**
* If true, the total runtime for the ensemble will be printed to stdout at completion
* This is independent of EnsembleConfig::quiet
* Defaults to false
*/
bool timing = false;
enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };
/**
* Off: Runs which fail do not cause an exception to be raised. Failed runs must be probed manually by checking the return value of CUDAEnsemble::simulate()
* Slow: If any runs fail, an EnsembleError will be raised after all runs have been attempted, before CUDAEnsemble::simulate() returns.
* Fast: An EnsembleError will be raised as soon as a failed run is detected, cancelling the remaining runs.
* Defaults to Slow
*/
ErrorLevel error_level = Slow;
};
/**
* Initialise CUDA Ensemble
@@ -70,10 +83,12 @@ class CUDAEnsemble {

/**
* Execute the ensemble of simulations.
* This call will block until all simulations have completed or MAX_ERRORS simulations exit with an error
* This call will normally block until all simulations have completed; however, it may return early if a run fails while the error_level configuration is set to Fast
* @param plan The plan of individual runs to execute during the ensemble
* @return 0 on success, otherwise the number of runs which reported errors and failed
* @see CUDAEnsemble::EnsembleConfig::error_level
*/
void simulate(const RunPlanVector &plan);
unsigned int simulate(const RunPlanVector &plan);

/**
* @return A mutable reference to the ensemble configuration struct
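The return-type change pairs with the `Off` level: callers that opt out of exceptions poll the failure count instead. A minimal sketch, assuming a mutable config accessor named `Config()` (a hypothetical name here; the accessor's declaration is truncated in this view):

```cpp
#include <cstdio>

#include "flamegpu/flamegpu.h"

// `model` and `plans` are assumed to be set up elsewhere.
unsigned int runWithoutExceptions(flamegpu::ModelDescription &model, const flamegpu::RunPlanVector &plans) {
    flamegpu::CUDAEnsemble ensemble(model);
    // With Off, simulate() does not throw for failed runs; check the count instead
    ensemble.Config().error_level = flamegpu::CUDAEnsemble::EnsembleConfig::Off;
    const unsigned int failed = ensemble.simulate(plans);  // No longer void: returns the failure count
    if (failed)
        std::fprintf(stderr, "%u runs reported errors\n", failed);
    return failed;
}
```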
2 changes: 1 addition & 1 deletion include/flamegpu/model/EnvironmentDescription.h
@@ -42,7 +42,7 @@ class EnvironmentDescription {
friend class CUDASimulation;

friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
2 changes: 1 addition & 1 deletion include/flamegpu/sim/LoggingConfig.h
@@ -38,7 +38,7 @@ class LoggingConfig {
/**
* Requires access to log_timing
*/
friend void CUDAEnsemble::simulate(const RunPlanVector& plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans);

public:
/**
2 changes: 1 addition & 1 deletion include/flamegpu/sim/RunPlanVector.h
@@ -25,7 +25,7 @@ class EnvironmentDescription;
class RunPlanVector : private std::vector<RunPlan> {
friend class RunPlan;
friend class SimRunner;
friend void CUDAEnsemble::simulate(const RunPlanVector &plans);
friend unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans);

public:
/**
21 changes: 20 additions & 1 deletion include/flamegpu/sim/SimRunner.h
@@ -8,6 +8,7 @@
#include <condition_variable>
#include <thread>
#include <vector>
#include <string>
#include "flamegpu/sim/LogFrame.h"

namespace flamegpu {
@@ -25,6 +26,12 @@ class RunPlanVector;
*/
class SimRunner {
friend class CUDAEnsemble;
struct ErrorDetail {
unsigned int run_id;
unsigned int device_id;
unsigned int runner_id;
std::string exception_string;
};
/**
* Constructor, creates and initialise a new SimRunner
* @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances.
@@ -36,10 +43,12 @@
* @param _device_id The GPU that all runs should execute on
* @param _runner_id A unique index assigned to the runner
* @param _verbose If true more information will be written to stdout
* @param _fail_fast If true, a failed run will cancel the remaining runs, and its error will be rethrown from the main thread
* @param run_logs Reference to the vector used to store generated run logs
* @param log_export_queue The queue of logs to be exported to disk
* @param log_export_queue_mutex This mutex must be locked to access log_export_queue
* @param log_export_queue_cdn The condition is notified every time a log has been added to the queue
* @param fast_err_detail Structure to store error details on fast failure, for rethrow from the main thread
*/
SimRunner(const std::shared_ptr<const ModelData> _model,
std::atomic<unsigned int> &_err_ct,
@@ -50,10 +59,12 @@
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &run_logs,
std::queue<unsigned int> &log_export_queue,
std::mutex &log_export_queue_mutex,
std::condition_variable &log_export_queue_cdn);
std::condition_variable &log_export_queue_cdn,
ErrorDetail &fast_err_detail);
/**
* Each sim runner takes its own clone of the model description hierarchy, so it can manipulate the environment without conflict
*/
@@ -74,6 +85,10 @@
* Flag for whether to print progress
*/
const bool verbose;
/**
* Flag for whether the runner should fail fast, cancelling the ensemble and raising an exception as soon as a run errors
*/
const bool fail_fast;
/**
* The thread which the SimRunner executes on
*/
@@ -119,6 +134,10 @@
* The condition is notified every time a log has been added to the queue
*/
std::condition_variable &log_export_queue_cdn;
/**
* If fail_fast is true, error details will be stored here on failure, so an exception can be thrown from the main thread
*/
ErrorDetail& fast_err_detail;
};

} // namespace flamegpu
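`ErrorDetail` exists so a worker thread can hand its failure back to the main thread for rethrow. A generic, standalone sketch of that pattern with a plain `std::thread` (toy code, not FLAMEGPU's `SimRunner`):

```cpp
#include <atomic>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>

// Failure details recorded by the worker, read by the main thread after join()
struct ErrorDetail {
    unsigned int run_id = 0;
    std::string exception_string;
};

int main() {
    std::atomic<unsigned int> err_ct{0};
    std::mutex detail_mutex;
    ErrorDetail detail;

    std::thread worker([&]() {
        try {
            throw std::runtime_error("simulated failure");  // stand-in for a failed run
        } catch (const std::exception &e) {
            ++err_ct;
            std::lock_guard<std::mutex> lck(detail_mutex);  // protect the shared detail
            detail.run_id = 7;
            detail.exception_string = e.what();
        }
    });
    worker.join();

    if (err_ct.load())  // rethrow from the main thread with the stored context
        throw std::runtime_error("Run " + std::to_string(detail.run_id) +
                                 " failed: " + detail.exception_string);
    return 0;
}
```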
41 changes: 39 additions & 2 deletions src/flamegpu/gpu/CUDAEnsemble.cu
@@ -35,7 +35,7 @@ CUDAEnsemble::~CUDAEnsemble() {



void CUDAEnsemble::simulate(const RunPlanVector &plans) {
unsigned int CUDAEnsemble::simulate(const RunPlanVector &plans) {
// Validate that RunPlan model matches CUDAEnsemble model
if (*plans.environment != this->model->environment->properties) {
THROW exception::InvalidArgument("RunPlan is for a different ModelDescription, in CUDAEnsemble::simulate()");
@@ -111,6 +111,7 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {
std::queue<unsigned int> log_export_queue;
std::mutex log_export_queue_mutex;
std::condition_variable log_export_queue_cdn;
SimRunner::ErrorDetail fast_err_detail = {};

// Init with placement new
{
@@ -121,7 +122,11 @@
unsigned int i = 0;
for (auto &d : devices) {
for (unsigned int j = 0; j < config.concurrent_runs; ++j) {
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans, step_log_config, exit_log_config, d, j, !config.quiet, run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn);
new (&runners[i++]) SimRunner(model, err_ct, next_run, plans,
step_log_config, exit_log_config,
d, j,
!config.quiet, config.error_level == EnsembleConfig::Fast,
run_logs, log_export_queue, log_export_queue_mutex, log_export_queue_cdn, fast_err_detail);
}
}
}
@@ -174,6 +179,15 @@ void CUDAEnsemble::simulate(const RunPlanVector &plans) {

// Free memory
free(runners);

if (config.error_level == EnsembleConfig::Fast && err_ct.load()) {
THROW exception::EnsembleError("Run %u failed on device %d, thread %u with exception: \n%s\n",
fast_err_detail.run_id, fast_err_detail.device_id, fast_err_detail.runner_id, fast_err_detail.exception_string.c_str());
} else if (config.error_level == EnsembleConfig::Slow && err_ct.load()) {
THROW exception::EnsembleError("%u/%u runs failed!\n.", err_ct.load(), static_cast<unsigned int>(plans.size()));
}

return err_ct.load();
}

void CUDAEnsemble::initialise(int argc, const char** argv) {
@@ -273,6 +287,27 @@ int CUDAEnsemble::checkArgs(int argc, const char** argv) {
config.timing = true;
continue;
}
// -e/--error, Specify the error level
if (arg.compare("--error") == 0 || arg.compare("-e") == 0) {
if (i + 1 >= argc) {
fprintf(stderr, "%s requires a trailing argument\n", arg.c_str());
return false;
}
std::string error_level_string = argv[++i];
// Convert the trailing arg to lower case
std::transform(error_level_string.begin(), error_level_string.end(), error_level_string.begin(), [](unsigned char c) { return std::use_facet< std::ctype<char>>(std::locale()).tolower(c); });
if (error_level_string.compare("off") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Off)) == 0) {
config.error_level = EnsembleConfig::Off;
} else if (error_level_string.compare("slow") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Slow)) == 0) {
config.error_level = EnsembleConfig::Slow;
} else if (error_level_string.compare("fast") == 0 || error_level_string.compare(std::to_string(EnsembleConfig::Fast)) == 0) {
config.error_level = EnsembleConfig::Fast;
} else {
fprintf(stderr, "%s is not an appropriate argument for %s\n", error_level_string.c_str(), arg.c_str());
return false;
}
continue;
}
fprintf(stderr, "Unexpected argument: %s\n", arg.c_str());
printHelp(argv[0]);
return false;
@@ -292,6 +327,8 @@ void CUDAEnsemble::printHelp(const char *executable) {
printf(line_fmt, "-o, --out <directory> <filetype>", "Directory and filetype for ensemble outputs");
printf(line_fmt, "-q, --quiet", "Don't print progress information to console");
printf(line_fmt, "-t, --timing", "Output timing information to stdout");
printf(line_fmt, "-e, --error <error level>", "The error level 0, 1, 2, off, slow or fast");
printf(line_fmt, "", "By default, \"slow\" will be used.");
}
void CUDAEnsemble::setStepLog(const StepLoggingConfig &stepConfig) {
// Validate ModelDescription matches
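With this change an ensemble binary accepts e.g. `--error fast` or `-e 2`. A standalone sketch of the same case-insensitive mapping, using `std::tolower` from `<cctype>` rather than the locale facet above (names are illustrative):

```cpp
#include <algorithm>
#include <cctype>
#include <optional>
#include <string>

enum ErrorLevel { Off = 0, Slow = 1, Fast = 2 };

// Maps "off"/"slow"/"fast" (or "0"/"1"/"2") onto the enum, case-insensitively
std::optional<ErrorLevel> parseErrorLevel(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (s == "off"  || s == "0") return Off;
    if (s == "slow" || s == "1") return Slow;
    if (s == "fast" || s == "2") return Fast;
    return std::nullopt;  // caller reports the unrecognised argument
}
```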
30 changes: 27 additions & 3 deletions src/flamegpu/sim/SimRunner.cu
@@ -23,15 +23,18 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
int _device_id,
unsigned int _runner_id,
bool _verbose,
bool _fail_fast,
std::vector<RunLog> &_run_logs,
std::queue<unsigned int> &_log_export_queue,
std::mutex &_log_export_queue_mutex,
std::condition_variable &_log_export_queue_cdn)
std::condition_variable &_log_export_queue_cdn,
ErrorDetail &_fast_err_detail)
: model(_model->clone())
, run_id(0)
, device_id(_device_id)
, runner_id(_runner_id)
, verbose(_verbose)
, fail_fast(_fail_fast)
, err_ct(_err_ct)
, next_run(_next_run)
, plans(_plans)
@@ -40,7 +43,8 @@ SimRunner::SimRunner(const std::shared_ptr<const ModelData> _model,
, run_logs(_run_logs)
, log_export_queue(_log_export_queue)
, log_export_queue_mutex(_log_export_queue_mutex)
, log_export_queue_cdn(_log_export_queue_cdn) {
, log_export_queue_cdn(_log_export_queue_cdn)
, fast_err_detail(_fast_err_detail) {
this->thread = std::thread(&SimRunner::start, this);
// Attempt to name the thread
#ifdef _MSC_VER
@@ -103,7 +107,27 @@ void SimRunner::start() {
fflush(stdout);
}
} catch(std::exception &e) {
fprintf(stderr, "\nRun %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
++err_ct;
if (this->fail_fast) {
// Kill the other workers early
next_run += static_cast<unsigned int>(plans.size());
{
std::lock_guard<std::mutex> lck(log_export_queue_mutex);
log_export_queue.push(UINT_MAX);
// log_export_queue_mutex is treated as our protection against race conditions on fast_err_detail
fast_err_detail.run_id = run_id;
fast_err_detail.device_id = device_id;
fast_err_detail.runner_id = runner_id;
fast_err_detail.exception_string = e.what();
}
return;
} else {
if (verbose) {
fprintf(stdout, "\n");
fflush(stdout);
}
fprintf(stderr, "Run %u failed on device %d, thread %u with exception: \n%s\n", run_id, device_id, runner_id, e.what());
}
}
}
}
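The fail-fast branch cancels sibling runners by jumping the shared `next_run` counter past `plans.size()`, and pushes `UINT_MAX` as a sentinel to wake the log-export thread. A minimal standalone illustration of the counter trick (toy code under those assumptions, not FLAMEGPU's scheduler):

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const unsigned int total = 100;           // stand-in for plans.size()
    std::atomic<unsigned int> next_run{0};    // shared job counter
    std::vector<std::thread> workers;

    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&]() {
            unsigned int i;
            while ((i = next_run++) < total) {  // claim the next job
                if (i == 13) {                  // stand-in for a failed run
                    next_run += total;          // cancel: siblings now fail the loop test
                    return;
                }
            }
        });
    }
    for (auto &w : workers) w.join();
    std::printf("counter stopped at %u\n", next_run.load());
    return 0;
}
```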