chore(notifier): Split 'Run()' into two goroutines: one to receive target updates and trigger reloads, and the other to send notifications.
This prevents the latter operation from blocking/starving the former. Previously, the `tsets` channel was consumed by the same goroutine that consumes and feeds the buffered `n.more` channel, so `tsets` was less likely to be ready: it is unbuffered and only fed every `SDManager.updatert` seconds.
See https://github.com/prometheus/prometheus/issues/13676 and https://github.com/prometheus/prometheus/issues/8768
The synchronization with the sendLoop goroutine is managed through the n.mtx mutex.
This uses an approach similar to the scrape manager's efbd6e41c5/scrape/manager.go (L115-L117).
The old TestHangingNotifier was replaced by the new one to more closely reflect reality.
Signed-off-by: machine424 <ayoubmrini424@gmail.com>
			
			
parent 94d28cd6cf
commit 690de487e2
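For context, the split described in the commit message above boils down to the pattern sketched below. This is a minimal, self-contained illustration with hypothetical, simplified types (a toy manager struct, plain []string targets, a throwaway main); it is not the Prometheus code itself, which follows in the diff. One goroutine consumes target updates and rebuilds shared state under the mutex, while a separate goroutine waits only on the notification signal, so slow sends can no longer starve the update path.

// Illustrative sketch only: names mirror the real notifier Manager, but the
// types here are simplified stand-ins.
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

type manager struct {
	ctx     context.Context
	mtx     sync.RWMutex
	more    chan struct{} // buffered "work is queued" signal
	targets []string      // stands in for the Alertmanager sets rebuilt by reload()
}

// sendLoop waits only on n.more (and ctx), so sending cannot block the
// consumption of target updates happening in Run.
func (n *manager) sendLoop() {
	for {
		select {
		case <-n.ctx.Done():
			return
		case <-n.more:
			n.mtx.RLock() // read the targets under the shared mutex
			fmt.Println("sending batch to", n.targets)
			n.mtx.RUnlock()
		}
	}
}

// Run consumes target updates and rebuilds state; dispatching happens in the
// background goroutine started here.
func (n *manager) Run(tsets <-chan []string) {
	go n.sendLoop()
	for {
		select {
		case <-n.ctx.Done():
			return
		case ts := <-tsets:
			n.mtx.Lock() // plays the role of reload() taking n.mtx
			n.targets = ts
			n.mtx.Unlock()
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	n := &manager{ctx: ctx, more: make(chan struct{}, 1)}
	tsets := make(chan []string)
	go n.Run(tsets)

	tsets <- []string{"am-1:9093"} // the update is consumed even while sends are pending
	n.more <- struct{}{}           // signal queued notifications
	<-ctx.Done()
}

In the actual change below, sendLoop drains the queue via nextBatch() and sendAll(), and Run calls reload(), which takes n.mtx before rebuilding the Alertmanager sets.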
					
@@ -298,26 +298,15 @@ func (n *Manager) nextBatch() []*Alert {
 	return alerts
 }
 
-// Run dispatches notifications continuously.
-func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
+// sendLoop continuously consumes the notifications queue and sends alerts to
+// the configured Alertmanagers.
+func (n *Manager) sendLoop() {
 	for {
-		// The select is split in two parts, such as we will first try to read
-		// new alertmanager targets if they are available, before sending new
-		// alerts.
 		select {
 		case <-n.ctx.Done():
 			return
-		case ts := <-tsets:
-			n.reload(ts)
-		default:
-			select {
-			case <-n.ctx.Done():
-				return
-			case ts := <-tsets:
-				n.reload(ts)
-			case <-n.more:
-			}
+		case <-n.more:
 		}
 		alerts := n.nextBatch()
 
 		if !n.sendAll(alerts...) {
@@ -330,6 +319,21 @@ func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
 	}
 }
 
+// Run receives updates of target groups and triggers a reload.
+// The dispatching of notifications occurs in the background to prevent blocking the receipt of target updates.
+// Refer to https://github.com/prometheus/prometheus/issues/13676 for more details.
+func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
+	go n.sendLoop()
+	for {
+		select {
+		case <-n.ctx.Done():
+			return
+		case ts := <-tsets:
+			n.reload(ts)
+		}
+	}
+}
+
 func (n *Manager) reload(tgs map[string][]*targetgroup.Group) {
 	n.mtx.Lock()
 	defer n.mtx.Unlock()
@@ -483,6 +487,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 
 		ams.mtx.RLock()
 
+
 		if len(ams.cfg.AlertRelabelConfigs) > 0 {
 			amAlerts = relabelAlerts(ams.cfg.AlertRelabelConfigs, labels.Labels{}, alerts)
 			if len(amAlerts) == 0 {
@@ -701,125 +701,10 @@ func TestLabelsToOpenAPILabelSet(t *testing.T) {
 	require.Equal(t, models.LabelSet{"aaa": "111", "bbb": "222"}, labelsToOpenAPILabelSet(labels.FromStrings("aaa", "111", "bbb", "222")))
 }
 
-// TestHangingNotifier validates that targets updates happen even when there are
-// queued alerts.
-func TestHangingNotifier(t *testing.T) {
-	// Note: When targets are not updated in time, this test is flaky because go
-	// selects are not deterministic. Therefore we run 10 subtests to run into the issue.
-	for i := 0; i < 10; i++ {
-		t.Run(strconv.Itoa(i), func(t *testing.T) {
-			var (
-				done    = make(chan struct{})
-				changed = make(chan struct{})
-				syncCh  = make(chan map[string][]*targetgroup.Group)
-			)
-
-			defer func() {
-				close(done)
-			}()
-
-			var calledOnce bool
-			// Setting up a bad server. This server hangs for 2 seconds.
-			badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if calledOnce {
-					t.Fatal("hanging server called multiple times")
-				}
-				calledOnce = true
-				select {
-				case <-done:
-				case <-time.After(2 * time.Second):
-				}
-			}))
-			badURL, err := url.Parse(badServer.URL)
-			require.NoError(t, err)
-			badAddress := badURL.Host // Used for __name__ label in targets.
-
-			// Setting up a bad server. This server returns fast, signaling requests on
-			// by closing the changed channel.
-			goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				close(changed)
-			}))
-			goodURL, err := url.Parse(goodServer.URL)
-			require.NoError(t, err)
-			goodAddress := goodURL.Host // Used for __name__ label in targets.
-
-			h := NewManager(
-				&Options{
-					QueueCapacity: 20 * maxBatchSize,
-				},
-				nil,
-			)
-
-			h.alertmanagers = make(map[string]*alertmanagerSet)
-
-			am1Cfg := config.DefaultAlertmanagerConfig
-			am1Cfg.Timeout = model.Duration(200 * time.Millisecond)
-
-			h.alertmanagers["config-0"] = &alertmanagerSet{
-				ams:     []alertmanager{},
-				cfg:     &am1Cfg,
-				metrics: h.metrics,
-			}
-			go h.Run(syncCh)
-			defer h.Stop()
-
-			var alerts []*Alert
-			for i := range make([]struct{}, 20*maxBatchSize) {
-				alerts = append(alerts, &Alert{
-					Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
-				})
-			}
-
-			// Injecting the hanging server URL.
-			syncCh <- map[string][]*targetgroup.Group{
-				"config-0": {
-					{
-						Targets: []model.LabelSet{
-							{
-								model.AddressLabel: model.LabelValue(badAddress),
-							},
-						},
-					},
-				},
-			}
-
-			// Queing alerts.
-			h.Send(alerts...)
-
-			// Updating with a working alertmanager target.
-			go func() {
-				select {
-				case syncCh <- map[string][]*targetgroup.Group{
-					"config-0": {
-						{
-							Targets: []model.LabelSet{
-								{
-									model.AddressLabel: model.LabelValue(goodAddress),
-								},
-							},
-						},
-					},
-				}:
-				case <-done:
-				}
-			}()
-
-			select {
-			case <-time.After(1 * time.Second):
-				t.Fatalf("Timeout after 1 second, targets not synced in time.")
-			case <-changed:
-				// The good server has been hit in less than 3 seconds, therefore
-				// targets have been updated before a second call could be made to the
-				// bad server.
-			}
-		})
-	}
-}
-
-// TODO: renameit and even replace TestHangingNotifier with it.
-// TestHangingNotifierXXX ensures that the notifier takes into account SD changes even when there are
-// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
-func TestHangingNotifierXXX(t *testing.T) {
+// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
+// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
+// and https://github.com/prometheus/prometheus/issues/8768.
+func TestHangingNotifier(t *testing.T) {
 	const (
 		batches     = 100
 		alertsCount = maxBatchSize * batches
@@ -857,6 +742,8 @@ func TestHangingNotifierXXX(t *testing.T) {
 	require.NoError(t, err)
 
 	// Initialize the discovery manager
+	// This is relevant as the updates aren't sent continually in real life, but only each updatert.
+	// The old implementation of TestHangingNotifier didn't take that into acount.
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	reg := prometheus.NewRegistry()