mirror of https://github.com/grafana/grafana.git
				
				
				
			Alerting: Add ha_reconnect_timeout configuration option (#88823)
* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration --------- Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									2d370f3983
								
							
						
					
					
						commit
						eb76ea47a0
					
				|  | @ -1265,6 +1265,10 @@ ha_label = | ||||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
| ha_gossip_interval = 200ms | ha_gossip_interval = 200ms | ||||||
| 
 | 
 | ||||||
|  | # Length of time to attempt to reconnect to a lost peer. A short value (less than 15m) is recommended when Grafana is running in a Kubernetes cluster. | ||||||
|  | # The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
|  | ha_reconnect_timeout = 6h | ||||||
|  | 
 | ||||||
| # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||||
| # across larger clusters at the expense of increased bandwidth usage. | # across larger clusters at the expense of increased bandwidth usage. | ||||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
|  |  | ||||||
|  | @ -1251,6 +1251,10 @@ | ||||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
| ;ha_gossip_interval = "200ms" | ;ha_gossip_interval = "200ms" | ||||||
| 
 | 
 | ||||||
|  | # Length of time to attempt to reconnect to a lost peer. A short value (less than 15m) is recommended when Grafana is running in a Kubernetes cluster. | ||||||
|  | # The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
|  | ;ha_reconnect_timeout = 6h | ||||||
|  | 
 | ||||||
| # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||||
| # across larger clusters at the expense of increased bandwidth usage. | # across larger clusters at the expense of increased bandwidth usage. | ||||||
| # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
|  |  | ||||||
|  | @ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics` | ||||||
|    ha_peers = "grafana-alerting.grafana:9094" |    ha_peers = "grafana-alerting.grafana:9094" | ||||||
|    ha_advertise_address = "${POD_IP}:9094" |    ha_advertise_address = "${POD_IP}:9094" | ||||||
|    ha_peer_timeout = 15s |    ha_peer_timeout = 15s | ||||||
|  |    ha_reconnect_timeout = 2m | ||||||
|    ``` |    ``` | ||||||
|  |  | ||||||
|  | @ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def | ||||||
| 
 | 
 | ||||||
| The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. | ||||||
| 
 | 
 | ||||||
|  | ### ha_reconnect_timeout | ||||||
|  | 
 | ||||||
|  | Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`. The default value is `6h`. | ||||||
|  | 
 | ||||||
|  | The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`. | ||||||
|  | 
 | ||||||
| ### ha_push_pull_interval | ### ha_push_pull_interval | ||||||
| 
 | 
 | ||||||
| The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds | ||||||
|  |  | ||||||
|  | @ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error { | ||||||
| 			return fmt.Errorf("unable to initialize gossip mesh: %w", err) | 			return fmt.Errorf("unable to initialize gossip mesh: %w", err) | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout) | 		err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err) | 			moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err) | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | @ -18,6 +18,7 @@ const ( | ||||||
| 	alertmanagerDefaultClusterAddr        = "0.0.0.0:9094" | 	alertmanagerDefaultClusterAddr        = "0.0.0.0:9094" | ||||||
| 	alertmanagerDefaultPeerTimeout        = 15 * time.Second | 	alertmanagerDefaultPeerTimeout        = 15 * time.Second | ||||||
| 	alertmanagerDefaultGossipInterval     = alertingCluster.DefaultGossipInterval | 	alertmanagerDefaultGossipInterval     = alertingCluster.DefaultGossipInterval | ||||||
|  | 	alertmanagerDefaultReconnectTimeout   = alertingCluster.DefaultReconnectTimeout | ||||||
| 	alertmanagerDefaultPushPullInterval   = alertingCluster.DefaultPushPullInterval | 	alertmanagerDefaultPushPullInterval   = alertingCluster.DefaultPushPullInterval | ||||||
| 	alertmanagerDefaultConfigPollInterval = time.Minute | 	alertmanagerDefaultConfigPollInterval = time.Minute | ||||||
| 	alertmanagerRedisDefaultMaxConns      = 5 | 	alertmanagerRedisDefaultMaxConns      = 5 | ||||||
|  | @ -71,6 +72,7 @@ type UnifiedAlertingSettings struct { | ||||||
| 	HAPeers                        []string | 	HAPeers                        []string | ||||||
| 	HAPeerTimeout                  time.Duration | 	HAPeerTimeout                  time.Duration | ||||||
| 	HAGossipInterval               time.Duration | 	HAGossipInterval               time.Duration | ||||||
|  | 	HAReconnectTimeout             time.Duration | ||||||
| 	HAPushPullInterval             time.Duration | 	HAPushPullInterval             time.Duration | ||||||
| 	HALabel                        string | 	HALabel                        string | ||||||
| 	HARedisClusterModeEnabled      bool | 	HARedisClusterModeEnabled      bool | ||||||
|  | @ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return err | 		return err | ||||||
| 	} | 	} | ||||||
|  | 	uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String())) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return err | ||||||
|  | 	} | ||||||
| 	uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String())) | 	uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String())) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return err | 		return err | ||||||
|  |  | ||||||
|  | @ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { | ||||||
| 		require.Len(t, cfg.UnifiedAlerting.HAPeers, 0) | 		require.Len(t, cfg.UnifiedAlerting.HAPeers, 0) | ||||||
| 		require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) | 		require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) | ||||||
| 		require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) | 		require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) | ||||||
|  | 		require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	// With peers set, it correctly parses them.
 | 	// With peers set, it correctly parses them.
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue