Skip to content

Commit

Permalink
Add for tenant silences limit
Browse files Browse the repository at this point in the history
Signed-off-by: SungJin1212 <tjdwls1201@gmail.com>
  • Loading branch information
SungJin1212 committed Feb 26, 2025
1 parent fc6c40d commit b3898e8
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
* [ENHANCEMENT] Alertmanager: Add new limits `-alertmanager.max-silences-count` and `-alertmanager.max-silences-size-bytes` for limiting silences per tenant. #6605
* [ENHANCEMENT] Add `compactor.auto-forget-delay` for compactor to auto forget compactors after X minutes without heartbeat. #6533
* [ENHANCEMENT] StoreGateway: Emit more histogram buckets on the `cortex_querier_storegateway_refetches_per_query` metric. #6570
* [ENHANCEMENT] Querier: Apply bytes limiter to LabelNames and LabelValuesForLabelNames. #6568
Expand Down
9 changes: 9 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -3751,6 +3751,15 @@ query_rejection:
# CLI flag: -alertmanager.max-alerts-size-bytes
[alertmanager_max_alerts_size_bytes: <int> | default = 0]

# Maximum number of silences that a single user can have, including expired
# silences. 0 = no limit.
# CLI flag: -alertmanager.max-silences-count
[alertmanager_max_silences_count: <int> | default = 0]

# Maximum size, in bytes, of any individual silence that a single user can
# have. 0 = no limit.
# CLI flag: -alertmanager.max-silences-size-bytes
[alertmanager_max_silences_size_bytes: <int> | default = 0]

# list of rule groups to disable
[disabled_rule_groups: <list of DisabledRuleGroup> | default = []]
```
Expand Down
9 changes: 7 additions & 2 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am.groupMarker = memMarker

silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)

am.silences, err = silence.New(silence.Options{
SnapshotFile: silencesFile,
Retention: cfg.Retention,
Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")),
Metrics: am.registry,
Limits: silence.Limits{
MaxSilences: func() int { return cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID) },
MaxSilenceSizeBytes: func() int { return cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID) },
},
Logger: util_log.GoKitLogToSlog(log.With(am.logger, "component", "silences")),
Metrics: am.registry,
})
if err != nil {
return nil, fmt.Errorf("failed to create silences: %v", err)
Expand Down
70 changes: 70 additions & 0 deletions pkg/alertmanager/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/silence/silencepb"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
Expand All @@ -19,6 +20,75 @@ import (
"github.com/cortexproject/cortex/pkg/util/test"
)

// TestSilencesLimits checks that the silence limits returned by Config.Limits
// are enforced by the silences store created in New:
//   - the silence count limit rejects new silences once maxSilencesCount
//     silences exist, and keeps rejecting them after those silences have been
//     expired (expired silences still count until they are garbage collected);
//   - the per-silence size limit rejects a silence whose matchers alone are
//     larger than maxSilencesSizeBytes.
func TestSilencesLimits(t *testing.T) {
	user := "test"

	reg := prometheus.NewPedanticRegistry()
	maxSilencesCount := 3
	maxSilencesSizeBytes := 500
	am, err := New(&Config{
		UserID:          user,
		Logger:          log.NewNopLogger(),
		Limits:          &mockAlertManagerLimits{maxSilencesCount: maxSilencesCount, maxSilencesSizeBytes: maxSilencesSizeBytes},
		TenantDataDir:   t.TempDir(),
		ExternalURL:     &url.URL{Path: "/am"},
		ShardingEnabled: false,
		GCInterval:      30 * time.Minute,
	}, reg)
	require.NoError(t, err)
	defer am.StopAndWait()

	t.Run("Test maxSilencesCount", func(t *testing.T) {
		// newSilence builds a fresh, currently active silence on every call so
		// each Set below is treated as a new silence.
		newSilence := func() *silencepb.Silence {
			return &silencepb.Silence{
				Matchers: []*silencepb.Matcher{{Name: "name", Pattern: "pattern"}},
				StartsAt: time.Now(),
				EndsAt:   time.Now().Add(time.Minute * 30),
			}
		}

		// The same message is expected both before and after expiring the
		// silences, so build it once.
		limitExceeded := fmt.Sprintf("exceeded maximum number of silences: %d (limit: %d)", maxSilencesCount, maxSilencesCount)

		// Create silences up to maxSilencesCount.
		for i := 0; i < maxSilencesCount; i++ {
			require.NoError(t, am.silences.Set(newSilence()))
		}

		// One more silence exceeds the limit.
		require.EqualError(t, am.silences.Set(newSilence()), limitExceeded)

		// Expire every stored silence.
		silences, _, err := am.silences.Query()
		require.NoError(t, err)
		for _, s := range silences {
			require.NoError(t, am.silences.Expire(s.Id))
		}

		// The count limit includes expired silences, so Set must still fail.
		require.EqualError(t, am.silences.Set(newSilence()), limitExceeded)

		// GC is expected to report exactly the silences expired above.
		n, err := am.silences.GC()
		require.NoError(t, err)
		require.Equal(t, maxSilencesCount, n)
	})

	t.Run("Test maxSilencesSizeBytes", func(t *testing.T) {
		// Name and pattern are each just over half the limit, so the matcher
		// payload alone is larger than maxSilencesSizeBytes.
		oversized := strings.Repeat("a", maxSilencesSizeBytes/2+1)
		bigSilence := &silencepb.Silence{
			Matchers: []*silencepb.Matcher{{Name: oversized, Pattern: strings.Repeat("b", maxSilencesSizeBytes/2+1)}},
			StartsAt: time.Now(),
			EndsAt:   time.Now().Add(time.Minute * 30),
		}

		// Use a subtest-local error instead of reassigning the outer err, so the
		// subtests do not share mutable state.
		err := am.silences.Set(bigSilence)
		require.ErrorContains(t, err, "silence exceeded maximum size")
	})
}

func TestDispatcherGroupLimits(t *testing.T) {
for name, tc := range map[string]struct {
groups int
Expand Down
6 changes: 6 additions & 0 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,12 @@ type Limits interface {
// AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit.
// Size of the alert is computed from alert labels, annotations and generator URL.
AlertmanagerMaxAlertsSizeBytes(tenant string) int

// AlertmanagerMaxSilencesCount returns max number of silences that tenant can have, including expired silences. 0 = no limit.
AlertmanagerMaxSilencesCount(tenant string) int

// AlertmanagerMaxSilenceSizeBytes returns the maximum size of an individual silence. 0 = no limit.
AlertmanagerMaxSilenceSizeBytes(tenant string) int
}

// A MultitenantAlertmanager manages Alertmanager instances for multiple
Expand Down
26 changes: 24 additions & 2 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1777,8 +1777,14 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
amConfig.ShardingEnabled = true
}

var limits validation.Limits
flagext.DefaultValues(&limits)

overrides, err := validation.NewOverrides(limits, nil)
require.NoError(t, err)

reg := prometheus.NewPedanticRegistry()
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg)
require.NoError(t, err)
defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck

Expand Down Expand Up @@ -1969,8 +1975,14 @@ func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testi

amConfig.ShardingEnabled = true

var limits validation.Limits
flagext.DefaultValues(&limits)

overrides, err := validation.NewOverrides(limits, nil)
require.NoError(t, err)

reg := prometheus.NewPedanticRegistry()
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, overrides, log.NewNopLogger(), reg)
require.NoError(t, err)

clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am)
Expand Down Expand Up @@ -2285,6 +2297,8 @@ type mockAlertManagerLimits struct {
maxDispatcherAggregationGroups int
maxAlertsCount int
maxAlertsSizeBytes int
maxSilencesCount int
maxSilencesSizeBytes int
}

func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int {
Expand Down Expand Up @@ -2326,3 +2340,11 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int {
func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int {
return m.maxAlertsSizeBytes
}

// AlertmanagerMaxSilencesCount returns the mock's fixed maxSilencesCount value.
// The tenant argument is ignored, so every tenant gets the same limit.
func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int {
return m.maxSilencesCount
}

// AlertmanagerMaxSilenceSizeBytes returns the mock's fixed maxSilencesSizeBytes
// value. The tenant argument is ignored, so every tenant gets the same limit.
func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int {
return m.maxSilencesSizeBytes
}
12 changes: 12 additions & 0 deletions pkg/util/validation/limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ type Limits struct {
AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"`
AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"`
AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"`
AlertmanagerMaxSilencesCount int `yaml:"alertmanager_max_silences_count" json:"alertmanager_max_silences_count"`
AlertmanagerMaxSilencesSizeBytes int `yaml:"alertmanager_max_silences_size_bytes" json:"alertmanager_max_silences_size_bytes"`
DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"`
}

Expand Down Expand Up @@ -310,6 +312,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that a single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that a single user can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxSilencesCount, "alertmanager.max-silences-count", 0, "Maximum number of silences that a single user can have, including expired silences. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxSilencesSizeBytes, "alertmanager.max-silences-size-bytes", 0, "Maximum size of individual silences that a single user can have. 0 = no limit.")
}

// Validate the limits config and returns an error if the validation
Expand Down Expand Up @@ -971,6 +975,14 @@ func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int {
return o.GetOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes
}

// AlertmanagerMaxSilencesCount returns the AlertmanagerMaxSilencesCount limit
// from the limits that GetOverridesForUser resolves for userID.
func (o *Overrides) AlertmanagerMaxSilencesCount(userID string) int {
return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesCount
}

// AlertmanagerMaxSilenceSizeBytes returns the AlertmanagerMaxSilencesSizeBytes
// limit from the limits that GetOverridesForUser resolves for userID.
// Note that the method name uses the singular "Silence" while the backing
// Limits field uses the plural "Silences".
func (o *Overrides) AlertmanagerMaxSilenceSizeBytes(userID string) int {
return o.GetOverridesForUser(userID).AlertmanagerMaxSilencesSizeBytes
}

func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups {
if o.tenantLimits != nil {
l := o.tenantLimits.ByUserID(userID)
Expand Down

0 comments on commit b3898e8

Please sign in to comment.