2021-09-16 22:33:51 +08:00
package setting
import (
2022-02-12 05:13:49 +08:00
"fmt"
2021-09-29 22:16:40 +08:00
"strconv"
2021-09-16 22:33:51 +08:00
"strings"
"time"
2024-05-14 21:21:42 +08:00
dstls "github.com/grafana/dskit/crypto/tls"
2021-09-21 19:08:52 +08:00
"github.com/grafana/grafana-plugin-sdk-go/backend/gtime"
2021-09-16 22:33:51 +08:00
"gopkg.in/ini.v1"
2023-01-30 16:26:42 +08:00
2024-06-21 04:33:03 +08:00
alertingCluster "github.com/grafana/alerting/cluster"
2023-01-30 16:26:42 +08:00
"github.com/grafana/grafana/pkg/util"
2021-09-16 22:33:51 +08:00
)
const (
2021-09-28 18:00:16 +08:00
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
alertmanagerDefaultPeerTimeout = 15 * time . Second
2023-12-21 19:34:48 +08:00
alertmanagerDefaultGossipInterval = alertingCluster . DefaultGossipInterval
2024-06-12 01:25:48 +08:00
alertmanagerDefaultReconnectTimeout = alertingCluster . DefaultReconnectTimeout
2023-12-21 19:34:48 +08:00
alertmanagerDefaultPushPullInterval = alertingCluster . DefaultPushPullInterval
2022-11-22 15:09:15 +08:00
alertmanagerDefaultConfigPollInterval = time . Minute
2023-08-30 03:59:12 +08:00
alertmanagerRedisDefaultMaxConns = 5
2021-09-24 01:52:20 +08:00
// To start, the alertmanager needs at least one route defined.
// TODO: we should move this to Grafana settings and define this as the default.
2021-09-28 18:00:16 +08:00
alertmanagerDefaultConfiguration = ` {
2021-09-24 01:52:20 +08:00
"alertmanager_config" : {
"route" : {
2022-07-12 00:24:43 +08:00
"receiver" : "grafana-default-email" ,
"group_by" : [ "grafana_folder" , "alertname" ]
2021-09-24 01:52:20 +08:00
} ,
"receivers" : [ {
"name" : "grafana-default-email" ,
"grafana_managed_receiver_configs" : [ {
"uid" : "" ,
"name" : "email receiver" ,
"type" : "email" ,
"settings" : {
"addresses" : "<example@email.com>"
}
} ]
} ]
}
}
`
2024-11-08 02:23:55 +08:00
alertingDefaultInitializationTimeout = 30 * time . Second
2021-09-28 18:00:16 +08:00
evaluatorDefaultEvaluationTimeout = 30 * time . Second
2025-05-13 19:25:56 +08:00
remoteAlertmanagerDefaultTimeout = 30 * time . Second
2022-11-22 15:09:15 +08:00
schedulerDefaultAdminConfigPollInterval = time . Minute
2024-03-08 05:01:11 +08:00
schedulerDefaultExecuteAlerts = true
2024-12-06 04:48:24 +08:00
schedulerDefaultMaxAttempts = 3
2025-09-04 19:56:03 +08:00
schedulerDefaultInitialRetryDelay = 1 * time . Second
schedulerDefaultMaxRetryDelay = 4 * time . Second
schedulerDefaultRandomizationFactor = 0.1
2021-09-28 18:00:16 +08:00
schedulerDefaultLegacyMinInterval = 1
2022-06-08 11:04:51 +08:00
screenshotsDefaultCapture = false
2023-01-06 00:07:46 +08:00
screenshotsDefaultCaptureTimeout = 10 * time . Second
screenshotsMaxCaptureTimeout = 30 * time . Second
2022-05-22 22:33:49 +08:00
screenshotsDefaultMaxConcurrent = 5
screenshotsDefaultUploadImageStorage = false
2022-02-12 05:13:49 +08:00
// SchedulerBaseInterval base interval of the scheduler. Controls how often the scheduler fetches database for new changes as well as schedules evaluation of a rule
// changing this value is discouraged because this could cause existing alert definition
// with intervals that are not exactly divided by this number not to be evaluated
SchedulerBaseInterval = 10 * time . Second
2022-02-18 23:05:06 +08:00
// DefaultRuleEvaluationInterval indicates a default interval of for how long a rule should be evaluated to change state from Pending to Alerting
2025-06-18 13:17:57 +08:00
DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds
stateHistoryDefaultEnabled = true
2025-07-17 21:26:26 +08:00
notificationHistoryDefaultEnabled = false
2025-06-18 13:17:57 +08:00
lokiDefaultMaxQueryLength = 721 * time . Hour // 30d1h, matches the default value in Loki
defaultRecordingRequestTimeout = 10 * time . Second
lokiDefaultMaxQuerySize = 65536 // 64kb
defaultHistorianPrometheusWriteTimeout = 10 * time . Second
2025-06-21 00:14:36 +08:00
defaultHistorianPrometheusMetricName = "GRAFANA_ALERTS"
2021-09-16 22:33:51 +08:00
)
2025-06-05 22:02:40 +08:00
// Validation errors returned while reading the Redis high-availability options.
var (
	// errHARedisBothClusterAndSentinel is returned when both Redis cluster mode
	// and Redis sentinel mode are enabled at the same time.
	errHARedisBothClusterAndSentinel = fmt.Errorf("'ha_redis_cluster_mode_enabled' and 'ha_redis_sentinel_mode_enabled' are mutually exclusive")

	// errHARedisSentinelMasterNameRequired is returned when sentinel mode is
	// enabled without a sentinel master name.
	errHARedisSentinelMasterNameRequired = fmt.Errorf("'ha_redis_sentinel_master_name' is required when 'ha_redis_sentinel_mode_enabled' is true")
)
2021-09-20 15:12:21 +08:00
type UnifiedAlertingSettings struct {
2024-07-24 20:22:29 +08:00
AdminConfigPollInterval time . Duration
AlertmanagerConfigPollInterval time . Duration
AlertmanagerMaxSilenceSizeBytes int
AlertmanagerMaxSilencesCount int
HAListenAddr string
HAAdvertiseAddr string
HAPeers [ ] string
HAPeerTimeout time . Duration
HAGossipInterval time . Duration
HAReconnectTimeout time . Duration
HAPushPullInterval time . Duration
HALabel string
HARedisClusterModeEnabled bool
2025-06-05 22:02:40 +08:00
HARedisSentinelModeEnabled bool
HARedisSentinelMasterName string
HARedisSentinelUsername string
HARedisSentinelPassword string
2024-07-24 20:22:29 +08:00
HARedisAddr string
HARedisPeerName string
HARedisPrefix string
HARedisUsername string
HARedisPassword string
HARedisDB int
HARedisMaxConns int
HARedisTLSEnabled bool
HARedisTLSConfig dstls . ClientConfig
2024-11-08 02:23:55 +08:00
InitializationTimeout time . Duration
2024-07-24 20:22:29 +08:00
MaxAttempts int64
2025-09-04 19:56:03 +08:00
InitialRetryDelay time . Duration
MaxRetryDelay time . Duration
RandomizationFactor float64
2024-07-24 20:22:29 +08:00
MinInterval time . Duration
EvaluationTimeout time . Duration
EvaluationResultLimit int
DisableJitter bool
ExecuteAlerts bool
DefaultConfiguration string
Enabled * bool // determines whether unified alerting is enabled. If it is nil then user did not define it and therefore its value will be determined during migration. Services should not use it directly.
DisabledOrgs map [ int64 ] struct { }
2022-02-12 05:13:49 +08:00
// BaseInterval interval of time the scheduler updates the rules and evaluates rules.
// Only for internal use and not user configuration.
BaseInterval time . Duration
2022-02-18 23:05:06 +08:00
// DefaultRuleEvaluationInterval default interval between evaluations of a rule.
DefaultRuleEvaluationInterval time . Duration
2022-05-22 22:33:49 +08:00
Screenshots UnifiedAlertingScreenshotSettings
2022-07-12 00:41:40 +08:00
ReservedLabels UnifiedAlertingReservedLabelSettings
2023-01-06 02:21:07 +08:00
StateHistory UnifiedAlertingStateHistorySettings
2025-07-17 21:26:26 +08:00
NotificationHistory UnifiedAlertingNotificationHistorySettings
2023-09-05 22:24:35 +08:00
RemoteAlertmanager RemoteAlertmanagerSettings
2024-06-13 04:04:46 +08:00
RecordingRules RecordingRuleSettings
2025-03-20 22:31:21 +08:00
PrometheusConversion UnifiedAlertingPrometheusConversionSettings
2024-06-13 04:04:46 +08:00
2023-06-23 18:36:07 +08:00
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
Alerting: Add jitter support for periodic alert state storage to reduce database load spikes (#111357)
What is this feature?
This PR implements a jitter mechanism for periodic alert state storage to distribute database load over time instead of processing all alert instances simultaneously. When enabled via the state_periodic_save_jitter_enabled configuration option, the system spreads batch write operations across 85% of the save interval window, preventing database load spikes in high-cardinality alerting environments.
Why do we need this feature?
In production environments with high alert cardinality, the current periodic batch storage can cause database performance issues by processing all alert instances simultaneously at fixed intervals. Even when using periodic batch storage to improve performance, concentrating all database operations at a single point in time can overwhelm database resources, especially in resource-constrained environments.
Rather than performing all INSERT operations at once during the periodic save, distributing these operations across the time window until the next save cycle can maintain more stable service operation within limited database resources. This approach prevents resource saturation by spreading the database load over the available time interval, allowing the system to operate more gracefully within existing resource constraints.
For example, with 200,000 alert instances using a 5-minute interval and 4,000 batch size, instead of executing 50 batch operations simultaneously, the jitter mechanism distributes these operations across approximately 4.25 minutes (85% of 5 minutes), with each batch executed roughly every 5.2 seconds.
This PR provides system-level protection against such load spikes by distributing operations across time, reducing peak resource usage while maintaining the benefits of periodic batch storage. The jitter mechanism is particularly valuable in resource-constrained environments where maintaining consistent database performance is more critical than precise timing of state updates.
2025-09-29 17:22:36 +08:00
MaxStateSaveConcurrency int
StatePeriodicSaveInterval time . Duration
StatePeriodicSaveBatchSize int
StatePeriodicSaveJitterEnabled bool
RulesPerRuleGroupLimit int64
2024-04-05 18:25:43 +08:00
// Retention period for Alertmanager notification log entries.
NotificationLogRetention time . Duration
2024-06-21 04:33:03 +08:00
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
ResolvedAlertRetention time . Duration
2024-10-05 05:31:21 +08:00
// RuleVersionRecordLimit defines the limit of how many alert rule versions
// should be stored in the database for each alert_rule in an organization including the current one.
// 0 value means no limit
RuleVersionRecordLimit int
2025-03-12 04:58:26 +08:00
// DeletedRuleRetention defines the maximum duration to retain deleted alerting rules before permanent removal.
DeletedRuleRetention time . Duration
2022-05-22 22:33:49 +08:00
}
2024-06-13 04:04:46 +08:00
// RecordingRuleSettings contains the configuration read from the
// `recording_rules` section.
type RecordingRuleSettings struct {
	// Enabled reports whether recording rules are enabled (`enabled`).
	Enabled bool
	// CustomHeaders maps header names to values.
	CustomHeaders map[string]string
	// Timeout is the recording request timeout (`timeout`).
	Timeout time.Duration
	// DefaultDatasourceUID is the value of `default_datasource_uid`; empty when unset.
	DefaultDatasourceUID string
}
2023-09-05 22:24:35 +08:00
// RemoteAlertmanagerSettings contains the configuration needed
// to disable the internal Alertmanager and use an external one instead.
// The values are read from the `remote.alertmanager` section.
type RemoteAlertmanagerSettings struct {
	// URL is the value of the `url` key.
	URL string
	// TenantID is the value of the `tenant` key.
	TenantID string
	// Password is the value of the `password` key.
	Password string
	// SyncInterval is the value of the `sync_interval` key.
	SyncInterval time.Duration
	// Timeout is the value of the `timeout` key.
	Timeout time.Duration
}
2022-05-22 22:33:49 +08:00
// UnifiedAlertingScreenshotSettings contains the configuration read from the
// `unified_alerting.screenshots` section.
type UnifiedAlertingScreenshotSettings struct {
	// Capture is the value of the `capture` key.
	Capture bool
	// CaptureTimeout is the value of the `capture_timeout` key.
	CaptureTimeout time.Duration
	// MaxConcurrentScreenshots is the value of the `max_concurrent_screenshots` key.
	MaxConcurrentScreenshots int64
	// UploadExternalImageStorage is the value of the `upload_external_image_storage` key.
	UploadExternalImageStorage bool
}
2022-07-12 00:41:40 +08:00
// UnifiedAlertingReservedLabelSettings contains the configuration of reserved labels.
type UnifiedAlertingReservedLabelSettings struct {
	// DisabledLabels is the set of reserved label names that have been disabled.
	DisabledLabels map[string]struct{}
}
2025-03-20 22:31:21 +08:00
// UnifiedAlertingPrometheusConversionSettings groups the options that control how
// Prometheus rules are converted to the Grafana format.
type UnifiedAlertingPrometheusConversionSettings struct {
	// RuleQueryOffset is the time offset applied to rule queries when converting
	// rules from the Prometheus format to the Grafana format.
	RuleQueryOffset time.Duration
}
2025-07-17 21:26:26 +08:00
// UnifiedAlertingLokiSettings contains the Loki connection and query options
// shared by the state history and notification history settings.
type UnifiedAlertingLokiSettings struct {
	LokiRemoteURL string
	LokiReadURL   string
	LokiWriteURL  string
	LokiTenantID  string
	// LokiBasicAuthUsername and LokiBasicAuthPassword are used for basic auth
	// if one of them is set.
	LokiBasicAuthPassword string
	LokiBasicAuthUsername string
	// LokiMaxQueryLength and LokiMaxQuerySize are the values of the
	// `loki_max_query_length` and `loki_max_query_size` keys.
	LokiMaxQueryLength time.Duration
	LokiMaxQuerySize   int
	// ExternalLabels maps label names to label values.
	ExternalLabels map[string]string
}
type UnifiedAlertingStateHistorySettings struct {
Enabled bool
Backend string
LokiSettings UnifiedAlertingLokiSettings
2025-06-18 13:17:57 +08:00
PrometheusMetricName string
PrometheusTargetDatasourceUID string
PrometheusWriteTimeout time . Duration
MultiPrimary string
MultiSecondaries [ ] string
ExternalLabels map [ string ] string
2023-01-06 02:21:07 +08:00
}
2025-07-17 21:26:26 +08:00
// UnifiedAlertingNotificationHistorySettings contains the notification history configuration.
type UnifiedAlertingNotificationHistorySettings struct {
	// Enabled reports whether notification history is enabled.
	Enabled bool
	// LokiSettings holds the Loki options used for notification history.
	LokiSettings UnifiedAlertingLokiSettings
}
2021-11-25 03:56:07 +08:00
// IsEnabled reports whether unified alerting should be treated as enabled.
// An unset (nil) Enabled counts as enabled; otherwise the stored value is returned.
// Callers use this instead of inspecting Enabled directly.
func (u *UnifiedAlertingSettings) IsEnabled() bool {
	if enabled := u.Enabled; enabled != nil {
		return *enabled
	}
	return true
}
2022-07-12 00:41:40 +08:00
// IsReservedLabelDisabled reports whether the given reserved label is a key of
// UnifiedAlertingReservedLabelSettings.DisabledLabels.
func (u *UnifiedAlertingReservedLabelSettings) IsReservedLabelDisabled(label string) bool {
	_, disabled := u.DisabledLabels[label]
	return disabled
}
2022-05-30 23:47:15 +08:00
// readUnifiedAlertingEnabledSettings reads the settings for unified alerting.
// It returns a non-nil bool and a nil error when unified alerting is enabled either
// because it has been enabled in the settings or by default. It returns nil and
// a non-nil error both unified alerting and legacy alerting are enabled at the same time.
2021-11-25 03:56:07 +08:00
func ( cfg * Cfg ) readUnifiedAlertingEnabledSetting ( section * ini . Section ) ( * bool , error ) {
2022-05-30 23:47:15 +08:00
// At present an invalid value is considered the same as no value. This means that a
// spelling mistake in the string "false" could enable unified alerting rather
// than disable it. This issue can be found here
2024-03-08 05:01:11 +08:00
if section . Key ( "enabled" ) . Value ( ) == "" {
return util . Pointer ( true ) , nil
2021-11-25 03:56:07 +08:00
}
2022-06-10 15:59:58 +08:00
unifiedAlerting , err := section . Key ( "enabled" ) . Bool ( )
if err != nil {
return nil , fmt . Errorf ( "invalid value %s, should be either true or false" , section . Key ( "enabled" ) )
}
2022-05-30 23:47:15 +08:00
return & unifiedAlerting , nil
2021-11-25 03:56:07 +08:00
}
2021-09-28 18:00:16 +08:00
// ReadUnifiedAlertingSettings reads both the `unified_alerting` and `alerting` sections of the configuration while preferring configuration from the `alerting` section.
// It first reads the `unified_alerting` section, then looks for non-defaults on the `alerting` section and prefers those.
2024-04-05 18:25:43 +08:00
//
// nolint: gocyclo
2021-09-16 22:33:51 +08:00
func ( cfg * Cfg ) ReadUnifiedAlertingSettings ( iniFile * ini . File ) error {
2021-11-25 03:56:07 +08:00
var err error
2021-09-20 15:12:21 +08:00
uaCfg := UnifiedAlertingSettings { }
2021-09-16 22:33:51 +08:00
ua := iniFile . Section ( "unified_alerting" )
2021-11-25 03:56:07 +08:00
uaCfg . Enabled , err = cfg . readUnifiedAlertingEnabledSetting ( ua )
if err != nil {
2022-06-10 15:59:58 +08:00
return fmt . Errorf ( "failed to read unified alerting enabled setting: %w" , err )
2021-09-29 22:16:40 +08:00
}
uaCfg . DisabledOrgs = make ( map [ int64 ] struct { } )
orgsStr := valueAsString ( ua , "disabled_orgs" , "" )
for _ , org := range util . SplitString ( orgsStr ) {
orgID , err := strconv . ParseInt ( org , 10 , 64 )
if err != nil {
return err
}
uaCfg . DisabledOrgs [ orgID ] = struct { } { }
}
2024-11-08 02:23:55 +08:00
uaCfg . InitializationTimeout , err = gtime . ParseDuration ( valueAsString ( ua , "initialization_timeout" , ( alertingDefaultInitializationTimeout ) . String ( ) ) )
if err != nil {
return err
}
2021-09-28 18:00:16 +08:00
uaCfg . AdminConfigPollInterval , err = gtime . ParseDuration ( valueAsString ( ua , "admin_config_poll_interval" , ( schedulerDefaultAdminConfigPollInterval ) . String ( ) ) )
2021-09-16 22:33:51 +08:00
if err != nil {
return err
}
2021-09-28 18:00:16 +08:00
uaCfg . AlertmanagerConfigPollInterval , err = gtime . ParseDuration ( valueAsString ( ua , "alertmanager_config_poll_interval" , ( alertmanagerDefaultConfigPollInterval ) . String ( ) ) )
2021-09-16 22:33:51 +08:00
if err != nil {
return err
}
2024-07-24 20:22:29 +08:00
uaCfg . AlertmanagerMaxSilenceSizeBytes = ua . Key ( "alertmanager_max_silence_size_bytes" ) . MustInt ( 0 )
uaCfg . AlertmanagerMaxSilencesCount = ua . Key ( "alertmanager_max_silences_count" ) . MustInt ( 0 )
2021-09-28 18:00:16 +08:00
uaCfg . HAPeerTimeout , err = gtime . ParseDuration ( valueAsString ( ua , "ha_peer_timeout" , ( alertmanagerDefaultPeerTimeout ) . String ( ) ) )
2021-09-16 22:33:51 +08:00
if err != nil {
return err
}
2021-09-28 18:00:16 +08:00
uaCfg . HAGossipInterval , err = gtime . ParseDuration ( valueAsString ( ua , "ha_gossip_interval" , ( alertmanagerDefaultGossipInterval ) . String ( ) ) )
2021-09-16 22:33:51 +08:00
if err != nil {
return err
}
2024-06-12 01:25:48 +08:00
uaCfg . HAReconnectTimeout , err = gtime . ParseDuration ( valueAsString ( ua , "ha_reconnect_timeout" , ( alertmanagerDefaultReconnectTimeout ) . String ( ) ) )
if err != nil {
return err
}
2021-09-28 18:00:16 +08:00
uaCfg . HAPushPullInterval , err = gtime . ParseDuration ( valueAsString ( ua , "ha_push_pull_interval" , ( alertmanagerDefaultPushPullInterval ) . String ( ) ) )
2021-09-16 22:33:51 +08:00
if err != nil {
return err
}
2021-09-28 18:00:16 +08:00
uaCfg . HAListenAddr = ua . Key ( "ha_listen_address" ) . MustString ( alertmanagerDefaultClusterAddr )
2021-09-20 15:12:21 +08:00
uaCfg . HAAdvertiseAddr = ua . Key ( "ha_advertise_address" ) . MustString ( "" )
2023-05-09 16:32:23 +08:00
uaCfg . HALabel = ua . Key ( "ha_label" ) . MustString ( "" )
2024-06-06 00:02:25 +08:00
uaCfg . HARedisClusterModeEnabled = ua . Key ( "ha_redis_cluster_mode_enabled" ) . MustBool ( false )
2025-06-05 22:02:40 +08:00
uaCfg . HARedisSentinelModeEnabled = ua . Key ( "ha_redis_sentinel_mode_enabled" ) . MustBool ( false )
if uaCfg . HARedisClusterModeEnabled && uaCfg . HARedisSentinelModeEnabled {
return errHARedisBothClusterAndSentinel
}
uaCfg . HARedisSentinelMasterName = ua . Key ( "ha_redis_sentinel_master_name" ) . MustString ( "" )
if uaCfg . HARedisSentinelModeEnabled && uaCfg . HARedisSentinelMasterName == "" {
return errHARedisSentinelMasterNameRequired
}
uaCfg . HARedisSentinelUsername = ua . Key ( "ha_redis_sentinel_username" ) . MustString ( "" )
uaCfg . HARedisSentinelPassword = ua . Key ( "ha_redis_sentinel_password" ) . MustString ( "" )
2023-04-19 23:05:26 +08:00
uaCfg . HARedisAddr = ua . Key ( "ha_redis_address" ) . MustString ( "" )
uaCfg . HARedisPeerName = ua . Key ( "ha_redis_peer_name" ) . MustString ( "" )
uaCfg . HARedisPrefix = ua . Key ( "ha_redis_prefix" ) . MustString ( "" )
uaCfg . HARedisUsername = ua . Key ( "ha_redis_username" ) . MustString ( "" )
uaCfg . HARedisPassword = ua . Key ( "ha_redis_password" ) . MustString ( "" )
uaCfg . HARedisDB = ua . Key ( "ha_redis_db" ) . MustInt ( 0 )
2023-08-30 03:59:12 +08:00
uaCfg . HARedisMaxConns = ua . Key ( "ha_redis_max_conns" ) . MustInt ( alertmanagerRedisDefaultMaxConns )
2021-09-16 22:33:51 +08:00
peers := ua . Key ( "ha_peers" ) . MustString ( "" )
2021-09-20 15:12:21 +08:00
uaCfg . HAPeers = make ( [ ] string , 0 )
2021-09-16 22:33:51 +08:00
if peers != "" {
for _ , peer := range strings . Split ( peers , "," ) {
peer = strings . TrimSpace ( peer )
2021-09-20 15:12:21 +08:00
uaCfg . HAPeers = append ( uaCfg . HAPeers , peer )
2021-09-16 22:33:51 +08:00
}
}
2024-05-14 21:21:42 +08:00
uaCfg . HARedisTLSEnabled = ua . Key ( "ha_redis_tls_enabled" ) . MustBool ( false )
uaCfg . HARedisTLSConfig . CertPath = ua . Key ( "ha_redis_tls_cert_path" ) . MustString ( "" )
uaCfg . HARedisTLSConfig . KeyPath = ua . Key ( "ha_redis_tls_key_path" ) . MustString ( "" )
uaCfg . HARedisTLSConfig . CAPath = ua . Key ( "ha_redis_tls_ca_path" ) . MustString ( "" )
uaCfg . HARedisTLSConfig . ServerName = ua . Key ( "ha_redis_tls_server_name" ) . MustString ( "" )
uaCfg . HARedisTLSConfig . InsecureSkipVerify = ua . Key ( "ha_redis_tls_insecure_skip_verify" ) . MustBool ( false )
uaCfg . HARedisTLSConfig . CipherSuites = ua . Key ( "ha_redis_tls_cipher_suites" ) . MustString ( "" )
uaCfg . HARedisTLSConfig . MinVersion = ua . Key ( "ha_redis_tls_min_version" ) . MustString ( "" )
2021-09-28 18:00:16 +08:00
2021-09-24 01:52:20 +08:00
// TODO load from ini file
2021-09-28 18:00:16 +08:00
uaCfg . DefaultConfiguration = alertmanagerDefaultConfiguration
alerting := iniFile . Section ( "alerting" )
2024-03-08 05:01:11 +08:00
uaExecuteAlerts := ua . Key ( "execute_alerts" ) . MustBool ( schedulerDefaultExecuteAlerts )
2021-09-28 18:00:16 +08:00
if uaExecuteAlerts { // unified option equals the default (true)
2024-03-08 05:01:11 +08:00
legacyExecuteAlerts := alerting . Key ( "execute_alerts" ) . MustBool ( schedulerDefaultExecuteAlerts )
2021-09-28 18:00:16 +08:00
if ! legacyExecuteAlerts {
cfg . Logger . Warn ( "falling back to legacy setting of 'execute_alerts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled." )
}
uaExecuteAlerts = legacyExecuteAlerts
}
uaCfg . ExecuteAlerts = uaExecuteAlerts
// if the unified alerting options equal the defaults, apply the respective legacy one
uaEvaluationTimeout , err := gtime . ParseDuration ( valueAsString ( ua , "evaluation_timeout" , evaluatorDefaultEvaluationTimeout . String ( ) ) )
if err != nil || uaEvaluationTimeout == evaluatorDefaultEvaluationTimeout { // unified option is invalid duration or equals the default
legaceEvaluationTimeout := time . Duration ( alerting . Key ( "evaluation_timeout_seconds" ) . MustInt64 ( int64 ( evaluatorDefaultEvaluationTimeout . Seconds ( ) ) ) ) * time . Second
if legaceEvaluationTimeout != evaluatorDefaultEvaluationTimeout {
cfg . Logger . Warn ( "falling back to legacy setting of 'evaluation_timeout_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled." )
}
uaEvaluationTimeout = legaceEvaluationTimeout
}
uaCfg . EvaluationTimeout = uaEvaluationTimeout
2023-12-06 01:42:34 +08:00
uaCfg . MaxAttempts = ua . Key ( "max_attempts" ) . MustInt64 ( schedulerDefaultMaxAttempts )
2021-09-28 18:00:16 +08:00
2025-09-04 19:56:03 +08:00
uaInitialRetryDelay , err := gtime . ParseDuration ( valueAsString ( ua , "initial_retry_delay" , schedulerDefaultInitialRetryDelay . String ( ) ) )
if err != nil {
cfg . Logger . Warn ( "failed to parse setting 'initial_retry_delay' as duration, falling back to the default value" , "error" , err , "default" , schedulerDefaultInitialRetryDelay )
uaInitialRetryDelay = schedulerDefaultInitialRetryDelay
}
uaCfg . InitialRetryDelay = uaInitialRetryDelay
uaMaxRetryDelay , err := gtime . ParseDuration ( valueAsString ( ua , "max_retry_delay" , schedulerDefaultMaxRetryDelay . String ( ) ) )
if err != nil {
cfg . Logger . Warn ( "failed to parse setting 'max_retry_delay' as duration, falling back to the default value" , "error" , err , "default" , schedulerDefaultMaxRetryDelay )
uaMaxRetryDelay = schedulerDefaultMaxRetryDelay
}
uaCfg . MaxRetryDelay = uaMaxRetryDelay
uaRandomizationFactor := ua . Key ( "randomization_factor" ) . MustFloat64 ( schedulerDefaultRandomizationFactor )
if uaRandomizationFactor < 0 || uaRandomizationFactor > 1 {
cfg . Logger . Warn ( "randomization_factor must be between 0 and 1, falling back to the default value" , "value" , uaRandomizationFactor , "default" , schedulerDefaultRandomizationFactor )
uaRandomizationFactor = schedulerDefaultRandomizationFactor
}
uaCfg . RandomizationFactor = uaRandomizationFactor
2022-02-12 05:13:49 +08:00
uaCfg . BaseInterval = SchedulerBaseInterval
2024-02-10 05:53:58 +08:00
// TODO: This was promoted from a feature toggle and is now the default behavior.
// We can consider removing the knob entirely in a release after 10.4.
uaCfg . DisableJitter = ua . Key ( "disable_jitter" ) . MustBool ( false )
2023-07-27 00:44:12 +08:00
// The base interval of the scheduler for evaluating alerts.
// 1. It is used by the internal scheduler's timer to tick at this interval.
// 2. to spread evaluations of rules that need to be evaluated at the current tick T. In other words, the evaluation of rules at the tick T will be evenly spread in the interval from T to T+scheduler_tick_interval.
// For example, if there are 100 rules that need to be evaluated at tick T, and the base interval is 10s, rules will be evaluated every 100ms.
// 3. It increases delay between rule updates and state reset.
// NOTE:
// 1. All alert rule intervals should be times of this interval. Otherwise, the rules will not be evaluated. It is not recommended to set it lower than 10s or odd numbers. Recommended: 10s, 30s, 1m
// 2. The increasing of the interval will affect how slow alert rule updates will reset the state, and therefore reset notification. Higher the interval - slower propagation of the changes.
baseInterval , err := gtime . ParseDuration ( valueAsString ( ua , "scheduler_tick_interval" , SchedulerBaseInterval . String ( ) ) )
if cfg . IsFeatureToggleEnabled ( "configurableSchedulerTick" ) { // use literal to avoid cycle imports
if err != nil {
return fmt . Errorf ( "failed to parse setting 'scheduler_tick_interval' as duration: %w" , err )
}
if baseInterval != SchedulerBaseInterval {
cfg . Logger . Warn ( "Scheduler tick interval is changed to non-default" , "interval" , baseInterval , "default" , SchedulerBaseInterval )
}
uaCfg . BaseInterval = baseInterval
} else if baseInterval != SchedulerBaseInterval {
cfg . Logger . Warn ( "Scheduler tick interval is changed to non-default but the feature flag is not enabled. Using default." , "interval" , baseInterval , "default" , SchedulerBaseInterval )
}
2022-02-12 05:13:49 +08:00
uaMinInterval , err := gtime . ParseDuration ( valueAsString ( ua , "min_interval" , uaCfg . BaseInterval . String ( ) ) )
if err != nil || uaMinInterval == uaCfg . BaseInterval { // unified option is invalid duration or equals the default
2021-09-28 18:00:16 +08:00
// if the legacy option is invalid, fallback to 10 (unified alerting min interval default)
2022-02-12 05:13:49 +08:00
legacyMinInterval := time . Duration ( alerting . Key ( "min_interval_seconds" ) . MustInt64 ( int64 ( uaCfg . BaseInterval . Seconds ( ) ) ) ) * time . Second
if legacyMinInterval > uaCfg . BaseInterval {
2021-09-28 18:00:16 +08:00
cfg . Logger . Warn ( "falling back to legacy setting of 'min_interval_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled." )
2022-02-12 05:13:49 +08:00
uaMinInterval = legacyMinInterval
} else {
// if legacy interval is smaller than the base interval, adjust it to the base interval
uaMinInterval = uaCfg . BaseInterval
2021-09-28 18:00:16 +08:00
}
2022-02-12 05:13:49 +08:00
}
if uaMinInterval < uaCfg . BaseInterval {
return fmt . Errorf ( "value of setting 'min_interval' should be greater than the base interval (%v)" , uaCfg . BaseInterval )
}
if uaMinInterval % uaCfg . BaseInterval != 0 {
return fmt . Errorf ( "value of setting 'min_interval' should be times of base interval (%v)" , uaCfg . BaseInterval )
2021-09-28 18:00:16 +08:00
}
uaCfg . MinInterval = uaMinInterval
2022-02-18 23:05:06 +08:00
uaCfg . DefaultRuleEvaluationInterval = DefaultRuleEvaluationInterval
if uaMinInterval > uaCfg . DefaultRuleEvaluationInterval {
uaCfg . DefaultRuleEvaluationInterval = uaMinInterval
2022-02-12 05:13:49 +08:00
}
2024-02-13 22:29:03 +08:00
quotas := iniFile . Section ( "quota" )
uaCfg . RulesPerRuleGroupLimit = quotas . Key ( "alerting_rule_group_rules" ) . MustInt64 ( 100 )
2024-06-27 15:45:15 +08:00
uaCfg . EvaluationResultLimit = quotas . Key ( "alerting_rule_evaluation_results" ) . MustInt ( - 1 )
2024-02-13 22:29:03 +08:00
2023-09-05 22:24:35 +08:00
remoteAlertmanager := iniFile . Section ( "remote.alertmanager" )
uaCfgRemoteAM := RemoteAlertmanagerSettings {
URL : remoteAlertmanager . Key ( "url" ) . MustString ( "" ) ,
TenantID : remoteAlertmanager . Key ( "tenant" ) . MustString ( "" ) ,
Password : remoteAlertmanager . Key ( "password" ) . MustString ( "" ) ,
}
2023-12-21 22:26:31 +08:00
uaCfgRemoteAM . SyncInterval , err = gtime . ParseDuration ( valueAsString ( remoteAlertmanager , "sync_interval" , ( schedulerDefaultAdminConfigPollInterval ) . String ( ) ) )
if err != nil {
return err
}
2025-05-13 19:25:56 +08:00
uaCfgRemoteAM . Timeout , err = gtime . ParseDuration ( valueAsString ( remoteAlertmanager , "timeout" , ( remoteAlertmanagerDefaultTimeout ) . String ( ) ) )
if err != nil {
return err
}
2023-12-21 22:26:31 +08:00
2023-09-05 22:24:35 +08:00
uaCfg . RemoteAlertmanager = uaCfgRemoteAM
2022-05-22 22:33:49 +08:00
screenshots := iniFile . Section ( "unified_alerting.screenshots" )
uaCfgScreenshots := uaCfg . Screenshots
2022-06-08 11:04:51 +08:00
uaCfgScreenshots . Capture = screenshots . Key ( "capture" ) . MustBool ( screenshotsDefaultCapture )
2023-01-06 00:07:46 +08:00
captureTimeout := screenshots . Key ( "capture_timeout" ) . MustDuration ( screenshotsDefaultCaptureTimeout )
if captureTimeout > screenshotsMaxCaptureTimeout {
return fmt . Errorf ( "value of setting 'capture_timeout' cannot exceed %s" , screenshotsMaxCaptureTimeout )
}
uaCfgScreenshots . CaptureTimeout = captureTimeout
2022-05-22 22:33:49 +08:00
uaCfgScreenshots . MaxConcurrentScreenshots = screenshots . Key ( "max_concurrent_screenshots" ) . MustInt64 ( screenshotsDefaultMaxConcurrent )
uaCfgScreenshots . UploadExternalImageStorage = screenshots . Key ( "upload_external_image_storage" ) . MustBool ( screenshotsDefaultUploadImageStorage )
uaCfg . Screenshots = uaCfgScreenshots
2022-07-12 00:41:40 +08:00
reservedLabels := iniFile . Section ( "unified_alerting.reserved_labels" )
uaCfgReservedLabels := UnifiedAlertingReservedLabelSettings {
DisabledLabels : make ( map [ string ] struct { } ) ,
}
for _ , label := range util . SplitString ( reservedLabels . Key ( "disabled_labels" ) . MustString ( "" ) ) {
uaCfgReservedLabels . DisabledLabels [ label ] = struct { } { }
}
uaCfg . ReservedLabels = uaCfgReservedLabels
2023-01-06 02:21:07 +08:00
stateHistory := iniFile . Section ( "unified_alerting.state_history" )
2023-01-31 04:24:45 +08:00
stateHistoryLabels := iniFile . Section ( "unified_alerting.state_history.external_labels" )
2023-01-06 02:21:07 +08:00
uaCfgStateHistory := UnifiedAlertingStateHistorySettings {
2025-07-17 21:26:26 +08:00
Enabled : stateHistory . Key ( "enabled" ) . MustBool ( stateHistoryDefaultEnabled ) ,
Backend : stateHistory . Key ( "backend" ) . MustString ( "annotations" ) ,
LokiSettings : UnifiedAlertingLokiSettings {
LokiRemoteURL : stateHistory . Key ( "loki_remote_url" ) . MustString ( "" ) ,
LokiReadURL : stateHistory . Key ( "loki_remote_read_url" ) . MustString ( "" ) ,
LokiWriteURL : stateHistory . Key ( "loki_remote_write_url" ) . MustString ( "" ) ,
LokiTenantID : stateHistory . Key ( "loki_tenant_id" ) . MustString ( "" ) ,
LokiBasicAuthUsername : stateHistory . Key ( "loki_basic_auth_username" ) . MustString ( "" ) ,
LokiBasicAuthPassword : stateHistory . Key ( "loki_basic_auth_password" ) . MustString ( "" ) ,
LokiMaxQueryLength : stateHistory . Key ( "loki_max_query_length" ) . MustDuration ( lokiDefaultMaxQueryLength ) ,
LokiMaxQuerySize : stateHistory . Key ( "loki_max_query_size" ) . MustInt ( lokiDefaultMaxQuerySize ) ,
} ,
2025-06-18 13:17:57 +08:00
MultiPrimary : stateHistory . Key ( "primary" ) . MustString ( "" ) ,
MultiSecondaries : splitTrim ( stateHistory . Key ( "secondaries" ) . MustString ( "" ) , "," ) ,
PrometheusMetricName : stateHistory . Key ( "prometheus_metric_name" ) . MustString ( defaultHistorianPrometheusMetricName ) ,
PrometheusTargetDatasourceUID : stateHistory . Key ( "prometheus_target_datasource_uid" ) . MustString ( "" ) ,
PrometheusWriteTimeout : stateHistory . Key ( "prometheus_write_timeout" ) . MustDuration ( defaultHistorianPrometheusWriteTimeout ) ,
ExternalLabels : stateHistoryLabels . KeysHash ( ) ,
2023-01-06 02:21:07 +08:00
}
uaCfg . StateHistory = uaCfgStateHistory
2025-07-17 21:26:26 +08:00
notificationHistory := iniFile . Section ( "unified_alerting.notification_history" )
notificationHistoryLabels := iniFile . Section ( "unified_alerting.notification_history.external_labels" )
uaCfgNotificationHistory := UnifiedAlertingNotificationHistorySettings {
Enabled : notificationHistory . Key ( "enabled" ) . MustBool ( notificationHistoryDefaultEnabled ) ,
LokiSettings : UnifiedAlertingLokiSettings {
LokiRemoteURL : notificationHistory . Key ( "loki_remote_url" ) . MustString ( "" ) ,
LokiReadURL : notificationHistory . Key ( "loki_remote_read_url" ) . MustString ( "" ) ,
LokiWriteURL : notificationHistory . Key ( "loki_remote_write_url" ) . MustString ( "" ) ,
LokiTenantID : notificationHistory . Key ( "loki_tenant_id" ) . MustString ( "" ) ,
LokiBasicAuthUsername : notificationHistory . Key ( "loki_basic_auth_username" ) . MustString ( "" ) ,
LokiBasicAuthPassword : notificationHistory . Key ( "loki_basic_auth_password" ) . MustString ( "" ) ,
LokiMaxQueryLength : notificationHistory . Key ( "loki_max_query_length" ) . MustDuration ( lokiDefaultMaxQueryLength ) ,
LokiMaxQuerySize : notificationHistory . Key ( "loki_max_query_size" ) . MustInt ( lokiDefaultMaxQuerySize ) ,
ExternalLabels : notificationHistoryLabels . KeysHash ( ) ,
} ,
}
uaCfg . NotificationHistory = uaCfgNotificationHistory
2025-03-20 22:31:21 +08:00
prometheusConversion := iniFile . Section ( "unified_alerting.prometheus_conversion" )
uaCfg . PrometheusConversion = UnifiedAlertingPrometheusConversionSettings {
RuleQueryOffset : prometheusConversion . Key ( "rule_query_offset" ) . MustDuration ( time . Minute ) ,
}
2024-06-13 04:04:46 +08:00
rr := iniFile . Section ( "recording_rules" )
uaCfgRecordingRules := RecordingRuleSettings {
2025-06-02 16:56:05 +08:00
Enabled : rr . Key ( "enabled" ) . MustBool ( true ) ,
2025-03-11 20:45:16 +08:00
Timeout : rr . Key ( "timeout" ) . MustDuration ( defaultRecordingRequestTimeout ) ,
DefaultDatasourceUID : rr . Key ( "default_datasource_uid" ) . MustString ( "" ) ,
2024-06-13 04:04:46 +08:00
}
rrHeaders := iniFile . Section ( "recording_rules.custom_headers" )
rrHeadersKeys := rrHeaders . Keys ( )
uaCfgRecordingRules . CustomHeaders = make ( map [ string ] string , len ( rrHeadersKeys ) )
for _ , key := range rrHeadersKeys {
uaCfgRecordingRules . CustomHeaders [ key . Name ( ) ] = key . Value ( )
}
uaCfg . RecordingRules = uaCfgRecordingRules
2023-06-23 18:36:07 +08:00
uaCfg . MaxStateSaveConcurrency = ua . Key ( "max_state_save_concurrency" ) . MustInt ( 1 )
2024-01-24 00:03:30 +08:00
uaCfg . StatePeriodicSaveInterval , err = gtime . ParseDuration ( valueAsString ( ua , "state_periodic_save_interval" , ( time . Minute * 5 ) . String ( ) ) )
if err != nil {
return err
}
2024-12-16 22:30:38 +08:00
uaCfg . StatePeriodicSaveBatchSize = ua . Key ( "state_periodic_save_batch_size" ) . MustInt ( 1 )
Alerting: Add jitter support for periodic alert state storage to reduce database load spikes (#111357)
What is this feature?
This PR implements a jitter mechanism for periodic alert state storage to distribute database load over time instead of processing all alert instances simultaneously. When enabled via the state_periodic_save_jitter_enabled configuration option, the system spreads batch write operations across 85% of the save interval window, preventing database load spikes in high-cardinality alerting environments.
Why do we need this feature?
In production environments with high alert cardinality, the current periodic batch storage can cause database performance issues by processing all alert instances simultaneously at fixed intervals. Even when using periodic batch storage to improve performance, concentrating all database operations at a single point in time can overwhelm database resources, especially in resource-constrained environments.
Rather than performing all INSERT operations at once during the periodic save, distributing these operations across the time window until the next save cycle can maintain more stable service operation within limited database resources. This approach prevents resource saturation by spreading the database load over the available time interval, allowing the system to operate more gracefully within existing resource constraints.
For example, with 200,000 alert instances using a 5-minute interval and 4,000 batch size, instead of executing 50 batch operations simultaneously, the jitter mechanism distributes these operations across approximately 4.25 minutes (85% of 5 minutes), with each batch executed roughly every 5.2 seconds.
This PR provides system-level protection against such load spikes by distributing operations across time, reducing peak resource usage while maintaining the benefits of periodic batch storage. The jitter mechanism is particularly valuable in resource-constrained environments where maintaining consistent database performance is more critical than precise timing of state updates.
2025-09-29 17:22:36 +08:00
uaCfg . StatePeriodicSaveJitterEnabled = ua . Key ( "state_periodic_save_jitter_enabled" ) . MustBool ( false )
2024-04-05 18:25:43 +08:00
uaCfg . NotificationLogRetention , err = gtime . ParseDuration ( valueAsString ( ua , "notification_log_retention" , ( 5 * 24 * time . Hour ) . String ( ) ) )
if err != nil {
return err
}
2024-06-21 04:33:03 +08:00
uaCfg . ResolvedAlertRetention , err = gtime . ParseDuration ( valueAsString ( ua , "resolved_alert_retention" , ( 15 * time . Minute ) . String ( ) ) )
if err != nil {
return err
}
2024-10-05 05:31:21 +08:00
uaCfg . RuleVersionRecordLimit = ua . Key ( "rule_version_record_limit" ) . MustInt ( 0 )
if uaCfg . RuleVersionRecordLimit < 0 {
return fmt . Errorf ( "setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed" )
}
2025-03-12 04:58:26 +08:00
uaCfg . DeletedRuleRetention = ua . Key ( "deleted_rule_retention" ) . MustDuration ( 30 * 24 * time . Hour )
if uaCfg . DeletedRuleRetention < 0 {
return fmt . Errorf ( "setting 'deleted_rule_retention' is invalid, only 0 or a positive duration are allowed" )
}
2021-09-20 15:12:21 +08:00
cfg . UnifiedAlerting = uaCfg
2021-09-16 22:33:51 +08:00
return nil
}
2021-09-28 18:00:16 +08:00
// GetAlertmanagerDefaultConfiguration returns the package's built-in default
// Alertmanager configuration as a raw JSON string. The value is the
// alertmanagerDefaultConfiguration constant, which defines a single route and
// one email receiver so that the Alertmanager has at least one route to start.
func GetAlertmanagerDefaultConfiguration() string {
	return alertmanagerDefaultConfiguration
}
2023-03-18 01:41:18 +08:00
// splitTrim splits s around each occurrence of sep and returns the pieces with
// leading and trailing whitespace removed.
//
// Pieces that are empty after trimming are dropped. strings.Split returns a
// one-element slice containing "" for an empty input, so without this filter an
// unset setting would be reported as a list holding a single empty entry, and
// doubled or trailing separators ("a,,b,") would inject empty entries as well.
// When nothing remains, the result is a nil slice.
func splitTrim(s string, sep string) []string {
	var out []string
	for _, part := range strings.Split(s, sep) {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			out = append(out, trimmed)
		}
	}
	return out
}