From b3898e8d1a5967084c196d2a0fd9bf3d60925b29 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Wed, 26 Feb 2025 10:01:05 +0900 Subject: [PATCH] Add for tenant silences limit Signed-off-by: SungJin1212 --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 9 +++ pkg/alertmanager/alertmanager.go | 9 ++- pkg/alertmanager/alertmanager_test.go | 70 +++++++++++++++++++++ pkg/alertmanager/multitenant.go | 6 ++ pkg/alertmanager/multitenant_test.go | 26 +++++++- pkg/util/validation/limits.go | 12 ++++ 7 files changed, 129 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f44817f77..aa3eaed7c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458 * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 * [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590 +* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605 * [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533 * [ENHANCEMENT] StoreGateway: Emit more histogram buckets on the `cortex_querier_storegateway_refetches_per_query` metric. #6570 * [ENHANCEMENT] Querier: Apply bytes limiter to LabelNames and LabelValuesForLabelNames. 
#6568 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 5a806a33f1..b1bfd4509e 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3751,6 +3751,15 @@ query_rejection: # CLI flag: -alertmanager.max-alerts-size-bytes [alertmanager_max_alerts_size_bytes: <int> | default = 0] +# Maximum number of silences that a single user can have, including expired +# silences. 0 = no limit. +# CLI flag: -alertmanager.max-silences-count +[alertmanager_max_silences_count: <int> | default = 0] + +# Maximum size of individual silences that a single user can have. 0 = no limit. +# CLI flag: -alertmanager.max-silences-size-bytes +[alertmanager_max_silences_size_bytes: <int> | default = 0] + # list of rule groups to disable [disabled_rule_groups: | default = []] ``` diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 51d37bed62..1cdc3a80df 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -228,11 +228,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.groupMarker = memMarker silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot) + am.silences, err = silence.New(silence.Options{ SnapshotFile: silencesFile, Retention: cfg.Retention, - Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")), - Metrics: am.registry, + Limits: silence.Limits{ + MaxSilences: func() int { return cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID) }, + MaxSilenceSizeBytes: func() int { return cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID) }, + }, + Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")), + Metrics: am.registry, }) if err != nil { return nil, fmt.Errorf("failed to create silences: %v", err) diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index 6859fb5086..c4ed3064fa 100644 ---
a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -9,6 +9,7 @@ import ( "github.com/go-kit/log" "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/silence/silencepb" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -19,6 +20,75 @@ import ( "github.com/cortexproject/cortex/pkg/util/test" ) +func TestSilencesLimits(t *testing.T) { + user := "test" + + reg := prometheus.NewPedanticRegistry() + maxSilencesCount := 3 + maxSilencesSizeBytes := 500 + am, err := New(&Config{ + UserID: user, + Logger: log.NewNopLogger(), + Limits: &mockAlertManagerLimits{maxSilencesCount: maxSilencesCount, maxSilencesSizeBytes: maxSilencesSizeBytes}, + TenantDataDir: t.TempDir(), + ExternalURL: &url.URL{Path: "/am"}, + ShardingEnabled: false, + GCInterval: 30 * time.Minute, + }, reg) + require.NoError(t, err) + defer am.StopAndWait() + + t.Run("Test maxSilencesCount", func(t *testing.T) { + createSilences := func() *silencepb.Silence { + return &silencepb.Silence{ + Matchers: []*silencepb.Matcher{{Name: "name", Pattern: "pattern"}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Minute * 30), + } + } + + // create silences up to maxSilencesCount + for i := 0; i < maxSilencesCount; i++ { + err := am.silences.Set(createSilences()) + require.NoError(t, err) + } + + // exceeds limit + err = am.silences.Set(createSilences()) + require.Error(t, err) + require.Equal(t, fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount), err.Error()) + + // expire whole silences + silences, _, err := am.silences.Query() + require.NoError(t, err) + for _, s := range silences { + err := am.silences.Expire(s.Id) + require.NoError(t, err) + } + + // check maxSilencesCount includes expired silences + err = am.silences.Set(createSilences()) + require.Error(t, err) + require.Equal(t, 
fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount), err.Error()) + + // GC + n, err := am.silences.GC() + require.NoError(t, err) + require.Equal(t, maxSilencesCount, n) + }) + t.Run("Test maxSilencesSizeBytes", func(t *testing.T) { + bigSilences := &silencepb.Silence{ + Matchers: []*silencepb.Matcher{{Name: strings.Repeat("a", maxSilencesSizeBytes/2+1), Pattern: strings.Repeat("b", maxSilencesSizeBytes/2+1)}}, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Minute * 30), + } + + err = am.silences.Set(bigSilences) + require.Error(t, err) + require.True(t, strings.Contains(err.Error(), "silence exceeded maximum size")) + }) +} + func TestDispatcherGroupLimits(t *testing.T) { for name, tc := range map[string]struct { groups int diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index abf421335e..0081e9ab78 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -228,6 +228,12 @@ type Limits interface { // AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit. // Size of the alert is computed from alert labels, annotations and generator URL. AlertmanagerMaxAlertsSizeBytes(tenant string) int + + // AlertmanagerMaxSilencesCount returns max number of silences that tenant can have, including expired silences. 0 = no limit. + AlertmanagerMaxSilencesCount(tenant string) int + + // AlertmanagerMaxSilenceSizeBytes returns the maximum size of an individual silence. 0 = no limit. 
+ AlertmanagerMaxSilenceSizeBytes(tenant string) int } // A MultitenantAlertmanager manages Alertmanager instances for multiple diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 6c9801ed61..4fc55df02b 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -1777,8 +1777,14 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) { amConfig.ShardingEnabled = true } + var limits validation.Limits + flagext.DefaultValues(&limits) + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + reg := prometheus.NewPedanticRegistry() - am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg) + am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg) require.NoError(t, err) defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck @@ -1969,8 +1975,14 @@ func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testi amConfig.ShardingEnabled = true + var limits validation.Limits + flagext.DefaultValues(&limits) + + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(t, err) + reg := prometheus.NewPedanticRegistry() - am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg) + am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg) require.NoError(t, err) clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am) @@ -2285,6 +2297,8 @@ type mockAlertManagerLimits struct { maxDispatcherAggregationGroups int maxAlertsCount int maxAlertsSizeBytes int + maxSilencesCount int + maxSilencesSizeBytes int } func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int { @@ -2326,3 +2340,11 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int { 
func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int { return m.maxAlertsSizeBytes } + +func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int { + return m.maxSilencesCount +} + +func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int { + return m.maxSilencesSizeBytes +} diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 729f62566e..b39a71c2b0 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -218,6 +218,8 @@ type Limits struct { AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"` AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"` AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"` + AlertmanagerMaxSilencesCount int `yaml:"alertmanager_max_silences_count" json:"alertmanager_max_silences_count"` + AlertmanagerMaxSilencesSizeBytes int `yaml:"alertmanager_max_silences_size_bytes" json:"alertmanager_max_silences_size_bytes"` DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"` } @@ -310,6 +312,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that a single user can have. 
Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that a single user can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxSilencesCount, "alertmanager.max-silences-count", 0, "Maximum number of silences that a single user can have, including expired silences. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxSilencesSizeBytes, "alertmanager.max-silences-size-bytes", 0, "Maximum size of individual silences that a single user can have. 0 = no limit.") } // Validate the limits config and returns an error if the validation @@ -971,6 +975,14 @@ func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int { return o.GetOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes } +func (o *Overrides) AlertmanagerMaxSilencesCount(userID string) int { + return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesCount +} + +func (o *Overrides) AlertmanagerMaxSilenceSizeBytes(userID string) int { + return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesSizeBytes +} + func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups { if o.tenantLimits != nil { l := o.tenantLimits.ByUserID(userID)