Skip to content

Commit

Permalink
feat(storcon): chaos injection of force exit (#10934)
Browse files Browse the repository at this point in the history
## Problem

close neondatabase/cloud#24485

## Summary of changes

This patch adds a new chaos injection mode for the storcon. The chaos
injector reads the configured cron schedule (`--chaos-exit-crontab`) and force-exits the process at the scheduled time.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
  • Loading branch information
skyzh authored Feb 24, 2025
1 parent fdde581 commit 5fad4a4
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 16 deletions.
16 changes: 14 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ byteorder = "1.4"
bytes = "1.9"
camino = "1.1.6"
cfg-if = "1.0.0"
cron = "0.15"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
Expand Down
1 change: 1 addition & 0 deletions storage_controller/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ anyhow.workspace = true
bytes.workspace = true
chrono.workspace = true
clap.workspace = true
cron.workspace = true
fail.workspace = true
futures.workspace = true
hex.workspace = true
Expand Down
10 changes: 8 additions & 2 deletions storage_controller/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,14 @@ struct Cli {
#[arg(long)]
neon_local_repo_dir: Option<PathBuf>,

/// Chaos testing
/// Chaos testing: exercise tenant migrations
#[arg(long)]
chaos_interval: Option<humantime::Duration>,

/// Chaos testing: exercise an immediate exit
#[arg(long)]
chaos_exit_crontab: Option<cron::Schedule>,

// Maximum acceptable lag for the secondary location while draining
// a pageserver
#[arg(long)]
Expand Down Expand Up @@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> {
let service = service.clone();
let cancel = CancellationToken::new();
let cancel_bg = cancel.clone();
let chaos_exit_crontab = args.chaos_exit_crontab;
(
tokio::task::spawn(
async move {
let mut chaos_injector = ChaosInjector::new(service, interval.into());
let mut chaos_injector =
ChaosInjector::new(service, interval.into(), chaos_exit_crontab);
chaos_injector.run(cancel_bg).await
}
.instrument(tracing::info_span!("chaos_injector")),
Expand Down
80 changes: 68 additions & 12 deletions storage_controller/src/service/chaos_injector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,80 @@ use super::{Node, Scheduler, Service, TenantShard};
/// Periodically perturbs the storage controller to exercise failure paths.
pub struct ChaosInjector {
// Handle to the service whose tenants we shuffle.
service: Arc<Service>,
// How often to run a tenant-shuffling chaos iteration.
interval: Duration,
// Optional cron schedule; when it fires, the process force-exits.
chaos_exit_crontab: Option<cron::Schedule>,
}

/// Build a [`tokio::time::Sleep`] that resolves at the schedule's next firing time.
///
/// # Errors
///
/// Returns an error if the schedule yields no upcoming firing time (so we
/// don't panic on a pathological schedule), or if the computed duration is
/// negative by the time it is converted via `to_std`.
fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result<tokio::time::Sleep> {
use chrono::Utc;
// `upcoming` can legitimately be empty (e.g. a schedule entirely in the
// past); surface that as an error instead of unwrap-panicking.
let next = cron
.upcoming(Utc)
.next()
.ok_or_else(|| anyhow::anyhow!("cron schedule has no upcoming firing time"))?;
let duration = (next - Utc::now()).to_std()?;
Ok(tokio::time::sleep(duration))
}

/// Await the timer when one is present, yielding `Some(())` on expiry;
/// resolve immediately to `None` when no timer was supplied. Intended for
/// use inside `tokio::select!`, where the `Some(_)` pattern guard skips
/// the disabled branch.
async fn maybe_sleep(sleep: Option<tokio::time::Sleep>) -> Option<()> {
match sleep {
Some(timer) => {
timer.await;
Some(())
}
None => None,
}
}

impl ChaosInjector {
pub fn new(service: Arc<Service>, interval: Duration) -> Self {
Self { service, interval }
/// Create a chaos injector over `service`, shuffling tenants every
/// `interval`; when `chaos_exit_crontab` is set, the process force-exits
/// at the scheduled times.
pub fn new(
service: Arc<Service>,
interval: Duration,
chaos_exit_crontab: Option<cron::Schedule>,
) -> Self {
Self {
service,
interval,
chaos_exit_crontab,
}
}

pub async fn run(&mut self, cancel: CancellationToken) {
let mut interval = tokio::time::interval(self.interval);

loop {
tokio::select! {
_ = interval.tick() => {}
_ = cancel.cancelled() => {
tracing::info!("Shutting down");
return;
let cron_interval = {
if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab {
match cron_to_next_duration(chaos_exit_crontab) {
Ok(interval_exit) => Some(interval_exit),
Err(e) => {
tracing::error!("Error processing the cron schedule: {e}");
None
}
}
} else {
None
}
};
enum ChaosEvent {
ShuffleTenant,
ForceKill,
}
let chaos_type = tokio::select! {
_ = interval.tick() => {
ChaosEvent::ShuffleTenant
}
Some(_) = maybe_sleep(cron_interval) => {
ChaosEvent::ForceKill
}
_ = cancel.cancelled() => {
tracing::info!("Shutting down");
return;
}
};

self.inject_chaos().await;

tracing::info!("Chaos iteration...");
match chaos_type {
ChaosEvent::ShuffleTenant => {
self.inject_chaos().await;
}
ChaosEvent::ForceKill => {
self.force_kill().await;
}
}

tracing::info!("Chaos iteration...");
}

/// If a shard has a secondary and attached location, then re-assign the secondary to be
Expand Down Expand Up @@ -95,6 +146,11 @@ impl ChaosInjector {
);
}

/// Chaos action: terminate the whole process immediately with a non-zero
/// exit code, simulating an abrupt storage controller crash.
/// NOTE(review): `std::process::exit` skips destructors and may drop
/// buffered log output — acceptable here since a hard crash is the point.
async fn force_kill(&mut self) {
tracing::warn!("Injecting chaos: force kill");
std::process::exit(1);
}

async fn inject_chaos(&mut self) {
// Pick some shards to interfere with
let batch_size = 128;
Expand Down

1 comment on commit 5fad4a4

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7141 tests run: 6786 passed, 13 failed, 342 skipped (full report)


Failures on Postgres 16

  • test_throughput[github-actions-selfhosted-50-pipelining_config7-30-100-128-batchable {'max_batch_size': 1, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config8-30-100-128-batchable {'max_batch_size': 2, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config9-30-100-128-batchable {'max_batch_size': 2, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config10-30-100-128-batchable {'max_batch_size': 4, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config11-30-100-128-batchable {'max_batch_size': 4, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config12-30-100-128-batchable {'max_batch_size': 8, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config13-30-100-128-batchable {'max_batch_size': 8, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config14-30-100-128-batchable {'max_batch_size': 16, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config15-30-100-128-batchable {'max_batch_size': 16, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config16-30-100-128-batchable {'max_batch_size': 32, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config17-30-100-128-batchable {'max_batch_size': 32, 'execution': 'tasks', 'mode': 'pipelined'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config5-30-100-128-batchable {'mode': 'serial'}]: release-x86-64-with-lfc
  • test_throughput[github-actions-selfhosted-50-pipelining_config6-30-100-128-batchable {'max_batch_size': 1, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]: release-x86-64-with-lfc
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config7-30-100-128-batchable {'max_batch_size': 1, 'execution': 'tasks', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config8-30-100-128-batchable {'max_batch_size': 2, 'execution': 'concurrent-futures', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config9-30-100-128-batchable {'max_batch_size': 2, 'execution': 'tasks', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config10-30-100-128-batchable {'max_batch_size': 4, 'execution': 'concurrent-futures', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config11-30-100-128-batchable {'max_batch_size': 4, 'execution': 'tasks', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config12-30-100-128-batchable {'max_batch_size': 8, 'execution': 'concurrent-futures', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config13-30-100-128-batchable {'max_batch_size': 8, 'execution': 'tasks', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config14-30-100-128-batchable {'max_batch_size': 16, 'execution': 'concurrent-futures', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config15-30-100-128-batchable {'max_batch_size': 16, 'execution': 'tasks', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config16-30-100-128-batchable {'max_batch_size': 32, 'execution': 'concurrent-futures', 'mode': 'pipelined'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config17-30-100-128-batchable {'max_batch_size': 32, 'execution': 'tasks', 'mode': 'pipelined'}] or 
test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config5-30-100-128-batchable {'mode': 'serial'}] or test_throughput[release-pg16-github-actions-selfhosted-50-pipelining_config6-30-100-128-batchable {'max_batch_size': 1, 'execution': 'concurrent-futures', 'mode': 'pipelined'}]"
Flaky tests (1)

Postgres 15

Test coverage report is not available

The comment gets automatically updated with the latest test results
5fad4a4 at 2025-02-24T20:43:11.527Z :recycle:

Please sign in to comment.