mirror of https://github.com/grafana/grafana.git
				
				
				
			Alerting: Add ha_reconnect_timeout configuration option (#88823)
* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration --------- Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									2d370f3983
								
							
						
					
					
						commit
						eb76ea47a0
					
				|  | @ -1265,6 +1265,10 @@ ha_label = | |||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
| ha_gossip_interval = 200ms | ||||
| 
 | ||||
| # Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster. | ||||
| # The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
| ha_reconnect_timeout = 6h | ||||
| 
 | ||||
| # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||
| # across larger clusters at the expense of increased bandwidth usage. | ||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
|  |  | |||
|  | @ -1251,6 +1251,10 @@ | |||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
| ;ha_gossip_interval = "200ms" | ||||
| 
 | ||||
| # Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster. | ||||
| # The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
| ;ha_reconnect_timeout = 6h | ||||
| 
 | ||||
| # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||
| # across larger clusters at the expense of increased bandwidth usage. | ||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
|  |  | |||
|  | @ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics` | |||
|    ha_peers = "grafana-alerting.grafana:9094" | ||||
|    ha_advertise_address = "${POD_IP}:9094" | ||||
|    ha_peer_timeout = 15s | ||||
|    ha_reconnect_timeout = 2m | ||||
|    ``` | ||||
|  |  | |||
|  | @ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def | |||
| 
 | ||||
| The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||
| 
 | ||||
| ### ha_reconnect_timeout | ||||
| 
 | ||||
| Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`. The default value is `6h`. | ||||
| 
 | ||||
| The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`. | ||||
| 
 | ||||
| ### ha_push_pull_interval | ||||
| 
 | ||||
| The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||
|  |  | |||
|  | @ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error { | |||
| 			return fmt.Errorf("unable to initialize gossip mesh: %w", err) | ||||
| 		} | ||||
| 
 | ||||
| 		err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout) | ||||
| 		err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout) | ||||
| 		if err != nil { | ||||
| 			moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err) | ||||
| 		} | ||||
|  |  | |||
|  | @ -18,6 +18,7 @@ const ( | |||
| 	alertmanagerDefaultClusterAddr        = "0.0.0.0:9094" | ||||
| 	alertmanagerDefaultPeerTimeout        = 15 * time.Second | ||||
| 	alertmanagerDefaultGossipInterval     = alertingCluster.DefaultGossipInterval | ||||
| 	alertmanagerDefaultReconnectTimeout   = alertingCluster.DefaultReconnectTimeout | ||||
| 	alertmanagerDefaultPushPullInterval   = alertingCluster.DefaultPushPullInterval | ||||
| 	alertmanagerDefaultConfigPollInterval = time.Minute | ||||
| 	alertmanagerRedisDefaultMaxConns      = 5 | ||||
|  | @ -71,6 +72,7 @@ type UnifiedAlertingSettings struct { | |||
| 	HAPeers                        []string | ||||
| 	HAPeerTimeout                  time.Duration | ||||
| 	HAGossipInterval               time.Duration | ||||
| 	HAReconnectTimeout             time.Duration | ||||
| 	HAPushPullInterval             time.Duration | ||||
| 	HALabel                        string | ||||
| 	HARedisClusterModeEnabled      bool | ||||
|  | @ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { | |||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String())) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String())) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
|  |  | |||
|  | @ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { | |||
| 		require.Len(t, cfg.UnifiedAlerting.HAPeers, 0) | ||||
| 		require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) | ||||
| 		require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) | ||||
| 		require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout) | ||||
| 	} | ||||
| 
 | ||||
| 	// With peers set, it correctly parses them.
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue