diff --git a/conf/defaults.ini b/conf/defaults.ini index 20449cc273a..966edbf4afc 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1265,6 +1265,10 @@ ha_label = # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. ha_gossip_interval = 200ms +# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster. +# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_reconnect_timeout = 6h + # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds # across larger clusters at the expense of increased bandwidth usage. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. diff --git a/conf/sample.ini b/conf/sample.ini index 23478c1ad97..40a3279a378 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1251,6 +1251,10 @@ # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. ;ha_gossip_interval = "200ms" +# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster. +# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_reconnect_timeout = 6h + # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds # across larger clusters at the expense of increased bandwidth usage. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. diff --git a/docs/sources/alerting/set-up/configure-high-availability/_index.md b/docs/sources/alerting/set-up/configure-high-availability/_index.md index 539b4c1fc9b..a706223479e 100644 --- a/docs/sources/alerting/set-up/configure-high-availability/_index.md +++ b/docs/sources/alerting/set-up/configure-high-availability/_index.md @@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics` ha_peers = "grafana-alerting.grafana:9094" ha_advertise_address = "${POD_IP}:9094" ha_peer_timeout = 15s + ha_reconnect_timeout = 2m ``` diff --git a/docs/sources/setup-grafana/configure-grafana/_index.md b/docs/sources/setup-grafana/configure-grafana/_index.md index 64c42453589..1527fd91830 100644 --- a/docs/sources/setup-grafana/configure-grafana/_index.md +++ b/docs/sources/setup-grafana/configure-grafana/_index.md @@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +### ha_reconnect_timeout + +Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`. + +The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`. + ### ha_push_pull_interval The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds diff --git a/pkg/services/ngalert/notifier/multiorg_alertmanager.go b/pkg/services/ngalert/notifier/multiorg_alertmanager.go index 4f70a3b57d9..4dbc1ad8602 100644 --- a/pkg/services/ngalert/notifier/multiorg_alertmanager.go +++ b/pkg/services/ngalert/notifier/multiorg_alertmanager.go @@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error { return fmt.Errorf("unable to initialize gossip mesh: %w", err) } - err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout) + err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout) if err != nil { moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err) } diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index 228fb049dc1..6536bd442f8 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -18,6 +18,7 @@ const ( alertmanagerDefaultClusterAddr = "0.0.0.0:9094" alertmanagerDefaultPeerTimeout = 15 * time.Second alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval + alertmanagerDefaultReconnectTimeout = alertingCluster.DefaultReconnectTimeout alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval alertmanagerDefaultConfigPollInterval = time.Minute alertmanagerRedisDefaultMaxConns = 5 @@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct { HAPeers []string HAPeerTimeout time.Duration HAGossipInterval time.Duration + HAReconnectTimeout time.Duration HAPushPullInterval time.Duration HALabel string HARedisClusterModeEnabled bool @@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { if err != nil { return err } + uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String())) + if err != nil { + return err + } uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String())) if err != nil { return err diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go index 4f4a540b401..e60dedf8fb6 100644 --- a/pkg/setting/setting_unified_alerting_test.go +++ b/pkg/setting/setting_unified_alerting_test.go @@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { require.Len(t, cfg.UnifiedAlerting.HAPeers, 0) require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) + require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout) } // With peers set, it correctly parses them.