mirror of https://github.com/grafana/grafana.git
				
				
				
			Alerting: Improve operational logs in sender package (#57134)
* Audit logs in sender package * Fix casing and touch up a few key names * Avoid logging entire alert struct * Log configuration ID being applied * Revert change to errorf rather than log * Tune levels further and remove some redundancies * Adjust logger naming and standardize log context * Adjust logger naming in router * Move log and get rid of dead error handling code
This commit is contained in:
		
							parent
							
								
									f3c219a980
								
							
						
					
					
						commit
						5ee4744d62
					
				|  | @ -51,7 +51,7 @@ func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store stor | |||
| 	clk clock.Clock, appURL *url.URL, disabledOrgs map[int64]struct{}, configPollInterval time.Duration, | ||||
| 	datasourceService datasources.DataSourceService, secretService secrets.Service) *AlertsRouter { | ||||
| 	d := &AlertsRouter{ | ||||
| 		logger:           log.New("alerts-router"), | ||||
| 		logger:           log.New("ngalert.sender.router"), | ||||
| 		clock:            clk, | ||||
| 		adminConfigStore: store, | ||||
| 
 | ||||
|  | @ -75,20 +75,18 @@ func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store stor | |||
| // SyncAndApplyConfigFromDatabase looks for the admin configuration in the database
 | ||||
| // and adjusts the sender(s) and alert handling mechanism accordingly.
 | ||||
| func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | ||||
| 	d.logger.Debug("start of admin configuration sync") | ||||
| 	cfgs, err := d.adminConfigStore.GetAdminConfigurations() | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	d.logger.Debug("found admin configurations", "count", len(cfgs)) | ||||
| 	d.logger.Debug("Attempting to sync admin configs", "count", len(cfgs)) | ||||
| 
 | ||||
| 	orgsFound := make(map[int64]struct{}, len(cfgs)) | ||||
| 	d.adminConfigMtx.Lock() | ||||
| 	for _, cfg := range cfgs { | ||||
| 		_, isDisabledOrg := d.disabledOrgs[cfg.OrgID] | ||||
| 		if isDisabledOrg { | ||||
| 			d.logger.Debug("skipping starting sender for disabled org", "org", cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
|  | @ -101,13 +99,13 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 
 | ||||
| 		//  We have no running sender and alerts are handled internally, no-op.
 | ||||
| 		if !ok && cfg.SendAlertsTo == models.InternalAlertmanager { | ||||
| 			d.logger.Debug("alerts are handled internally", "org", cfg.OrgID) | ||||
| 			d.logger.Debug("Grafana is configured to send alerts to the internal alertmanager only. Skipping synchronization with external alertmanager", "org", cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		externalAlertmanagers, err := d.alertmanagersFromDatasources(cfg.OrgID) | ||||
| 		if err != nil { | ||||
| 			d.logger.Error("failed to get alertmanagers from datasources", | ||||
| 			d.logger.Error("Failed to get alertmanagers from datasources", | ||||
| 				"org", cfg.OrgID, | ||||
| 				"error", err) | ||||
| 			continue | ||||
|  | @ -116,13 +114,13 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 
 | ||||
| 		// We have no running sender and no Alertmanager(s) configured, no-op.
 | ||||
| 		if !ok && len(cfg.Alertmanagers) == 0 { | ||||
| 			d.logger.Debug("no external alertmanagers configured", "org", cfg.OrgID) | ||||
| 			d.logger.Debug("No external alertmanagers configured", "org", cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		// We have a running sender but no Alertmanager(s) configured, shut it down.
 | ||||
| 		if ok && len(cfg.Alertmanagers) == 0 { | ||||
| 			d.logger.Debug("no external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID) | ||||
| 			d.logger.Info("No external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID) | ||||
| 			delete(orgsFound, cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
|  | @ -132,7 +130,7 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 		for _, am := range cfg.Alertmanagers { | ||||
| 			parsedAM, err := url.Parse(am) | ||||
| 			if err != nil { | ||||
| 				d.logger.Error("failed to parse alertmanager string", | ||||
| 				d.logger.Error("Failed to parse alertmanager string", | ||||
| 					"org", cfg.OrgID, | ||||
| 					"error", err) | ||||
| 				continue | ||||
|  | @ -140,20 +138,20 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 			redactedAMs = append(redactedAMs, parsedAM.Redacted()) | ||||
| 		} | ||||
| 
 | ||||
| 		d.logger.Debug("alertmanagers found in the configuration", "alertmanagers", redactedAMs) | ||||
| 		d.logger.Debug("Alertmanagers found in the configuration", "alertmanagers", redactedAMs) | ||||
| 
 | ||||
| 		// We have a running sender, check if we need to apply a new config.
 | ||||
| 		amHash := cfg.AsSHA256() | ||||
| 		if ok { | ||||
| 			if d.externalAlertmanagersCfgHash[cfg.OrgID] == amHash { | ||||
| 				d.logger.Debug("sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", redactedAMs) | ||||
| 				d.logger.Debug("Sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", redactedAMs) | ||||
| 				continue | ||||
| 			} | ||||
| 
 | ||||
| 			d.logger.Debug("applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", redactedAMs) | ||||
| 			d.logger.Info("Applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", redactedAMs, "cfg", cfg.ID) | ||||
| 			err := existing.ApplyConfig(cfg) | ||||
| 			if err != nil { | ||||
| 				d.logger.Error("failed to apply configuration", "error", err, "org", cfg.OrgID) | ||||
| 				d.logger.Error("Failed to apply configuration", "error", err, "org", cfg.OrgID) | ||||
| 				continue | ||||
| 			} | ||||
| 			d.externalAlertmanagersCfgHash[cfg.OrgID] = amHash | ||||
|  | @ -161,19 +159,14 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 		} | ||||
| 
 | ||||
| 		// No sender and have Alertmanager(s) to send to - start a new one.
 | ||||
| 		d.logger.Info("creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", redactedAMs) | ||||
| 		s, err := NewExternalAlertmanagerSender() | ||||
| 		if err != nil { | ||||
| 			d.logger.Error("unable to start the sender", "error", err, "org", cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		d.logger.Info("Creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", redactedAMs) | ||||
| 		s := NewExternalAlertmanagerSender() | ||||
| 		d.externalAlertmanagers[cfg.OrgID] = s | ||||
| 		s.Run() | ||||
| 
 | ||||
| 		err = s.ApplyConfig(cfg) | ||||
| 		if err != nil { | ||||
| 			d.logger.Error("failed to apply configuration", "error", err, "org", cfg.OrgID) | ||||
| 			d.logger.Error("Failed to apply configuration", "error", err, "org", cfg.OrgID) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
|  | @ -193,12 +186,12 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error { | |||
| 
 | ||||
| 	// We can now stop these externalAlertmanagers w/o having to hold a lock.
 | ||||
| 	for orgID, s := range sendersToStop { | ||||
| 		d.logger.Info("stopping sender", "org", orgID) | ||||
| 		d.logger.Info("Stopping sender", "org", orgID) | ||||
| 		s.Stop() | ||||
| 		d.logger.Info("stopped sender", "org", orgID) | ||||
| 		d.logger.Info("Stopped sender", "org", orgID) | ||||
| 	} | ||||
| 
 | ||||
| 	d.logger.Debug("finish of admin configuration sync") | ||||
| 	d.logger.Debug("Finish of admin configuration sync") | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
|  | @ -223,7 +216,7 @@ func (d *AlertsRouter) alertmanagersFromDatasources(orgID int64) ([]string, erro | |||
| 		} | ||||
| 		amURL, err := d.buildExternalURL(ds) | ||||
| 		if err != nil { | ||||
| 			d.logger.Error("failed to build external alertmanager URL", | ||||
| 			d.logger.Error("Failed to build external alertmanager URL", | ||||
| 				"org", ds.OrgId, | ||||
| 				"uid", ds.Uid, | ||||
| 				"error", err) | ||||
|  | @ -255,29 +248,29 @@ func (d *AlertsRouter) buildExternalURL(ds *datasources.DataSource) (string, err | |||
| } | ||||
| 
 | ||||
| func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.PostableAlerts) { | ||||
| 	logger := d.logger.New("rule_uid", key.UID, "org", key.OrgID) | ||||
| 	logger := d.logger.New(key.LogContext()...) | ||||
| 	if len(alerts.PostableAlerts) == 0 { | ||||
| 		logger.Debug("no alerts to notify about") | ||||
| 		logger.Info("No alerts to notify about") | ||||
| 		return | ||||
| 	} | ||||
| 	// Send alerts to local notifier if they need to be handled internally
 | ||||
| 	// or if no external AMs have been discovered yet.
 | ||||
| 	var localNotifierExist, externalNotifierExist bool | ||||
| 	if d.sendAlertsTo[key.OrgID] == models.ExternalAlertmanagers && len(d.AlertmanagersFor(key.OrgID)) > 0 { | ||||
| 		logger.Debug("no alerts to put in the notifier") | ||||
| 		logger.Debug("All alerts for the given org should be routed to external notifiers only. skipping the internal notifier.") | ||||
| 	} else { | ||||
| 		logger.Debug("sending alerts to local notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts) | ||||
| 		logger.Info("Sending alerts to local notifier", "count", len(alerts.PostableAlerts)) | ||||
| 		n, err := d.multiOrgNotifier.AlertmanagerFor(key.OrgID) | ||||
| 		if err == nil { | ||||
| 			localNotifierExist = true | ||||
| 			if err := n.PutAlerts(alerts); err != nil { | ||||
| 				logger.Error("failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "error", err) | ||||
| 				logger.Error("Failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "error", err) | ||||
| 			} | ||||
| 		} else { | ||||
| 			if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) { | ||||
| 				logger.Debug("local notifier was not found") | ||||
| 				logger.Debug("Local notifier was not found") | ||||
| 			} else { | ||||
| 				logger.Error("local notifier is not available", "error", err) | ||||
| 				logger.Error("Local notifier is not available", "error", err) | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
|  | @ -288,13 +281,13 @@ func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.Postable | |||
| 	defer d.adminConfigMtx.RUnlock() | ||||
| 	s, ok := d.externalAlertmanagers[key.OrgID] | ||||
| 	if ok && d.sendAlertsTo[key.OrgID] != models.InternalAlertmanager { | ||||
| 		logger.Debug("sending alerts to external notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts) | ||||
| 		logger.Info("Sending alerts to external notifier", "count", len(alerts.PostableAlerts)) | ||||
| 		s.SendAlerts(alerts) | ||||
| 		externalNotifierExist = true | ||||
| 	} | ||||
| 
 | ||||
| 	if !localNotifierExist && !externalNotifierExist { | ||||
| 		logger.Error("no external or internal notifier - [%d] alerts not delivered", len(alerts.PostableAlerts)) | ||||
| 		logger.Error("No external or internal notifier - alerts not delivered", "count", len(alerts.PostableAlerts)) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
|  | @ -327,7 +320,7 @@ func (d *AlertsRouter) Run(ctx context.Context) error { | |||
| 		select { | ||||
| 		case <-time.After(d.adminConfigPollInterval): | ||||
| 			if err := d.SyncAndApplyConfigFromDatabase(); err != nil { | ||||
| 				d.logger.Error("unable to sync admin configuration", "error", err) | ||||
| 				d.logger.Error("Unable to sync admin configuration", "error", err) | ||||
| 			} | ||||
| 		case <-ctx.Done(): | ||||
| 			// Stop sending alerts to all external Alertmanager(s).
 | ||||
|  |  | |||
|  | @ -42,8 +42,8 @@ type ExternalAlertmanager struct { | |||
| 	sdManager *discovery.Manager | ||||
| } | ||||
| 
 | ||||
| func NewExternalAlertmanagerSender() (*ExternalAlertmanager, error) { | ||||
| 	l := log.New("sender") | ||||
| func NewExternalAlertmanagerSender() *ExternalAlertmanager { | ||||
| 	l := log.New("ngalert.sender.external-alertmanager") | ||||
| 	sdCtx, sdCancel := context.WithCancel(context.Background()) | ||||
| 	s := &ExternalAlertmanager{ | ||||
| 		logger:   l, | ||||
|  | @ -59,7 +59,7 @@ func NewExternalAlertmanagerSender() (*ExternalAlertmanager, error) { | |||
| 
 | ||||
| 	s.sdManager = discovery.NewManager(sdCtx, s.logger) | ||||
| 
 | ||||
| 	return s, nil | ||||
| 	return s | ||||
| } | ||||
| 
 | ||||
| // ApplyConfig syncs a configuration with the sender.
 | ||||
|  | @ -69,6 +69,9 @@ func (s *ExternalAlertmanager) ApplyConfig(cfg *ngmodels.AdminConfiguration) err | |||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	s.logger = s.logger.New("org", cfg.OrgID, "cfg", cfg.ID) | ||||
| 
 | ||||
| 	s.logger.Info("Synchronizing config with external Alertmanager group") | ||||
| 	if err := s.manager.ApplyConfig(notifierCfg); err != nil { | ||||
| 		return err | ||||
| 	} | ||||
|  | @ -85,8 +88,10 @@ func (s *ExternalAlertmanager) Run() { | |||
| 	s.wg.Add(2) | ||||
| 
 | ||||
| 	go func() { | ||||
| 		s.logger.Info("Initiating communication with a group of external Alertmanagers") | ||||
| 
 | ||||
| 		if err := s.sdManager.Run(); err != nil { | ||||
| 			s.logger.Error("failed to start the sender service discovery manager", "error", err) | ||||
| 			s.logger.Error("Failed to start the sender service discovery manager", "error", err) | ||||
| 		} | ||||
| 		s.wg.Done() | ||||
| 	}() | ||||
|  | @ -100,7 +105,6 @@ func (s *ExternalAlertmanager) Run() { | |||
| // SendAlerts sends a set of alerts to the configured Alertmanager(s).
 | ||||
| func (s *ExternalAlertmanager) SendAlerts(alerts apimodels.PostableAlerts) { | ||||
| 	if len(alerts.PostableAlerts) == 0 { | ||||
| 		s.logger.Debug("no alerts to send to external Alertmanager(s)") | ||||
| 		return | ||||
| 	} | ||||
| 	as := make([]*notifier.Alert, 0, len(alerts.PostableAlerts)) | ||||
|  | @ -109,12 +113,12 @@ func (s *ExternalAlertmanager) SendAlerts(alerts apimodels.PostableAlerts) { | |||
| 		as = append(as, na) | ||||
| 	} | ||||
| 
 | ||||
| 	s.logger.Debug("sending alerts to the external Alertmanager(s)", "am_count", len(s.manager.Alertmanagers()), "alert_count", len(as)) | ||||
| 	s.manager.Send(as...) | ||||
| } | ||||
| 
 | ||||
| // Stop shuts down the sender.
 | ||||
| func (s *ExternalAlertmanager) Stop() { | ||||
| 	s.logger.Info("Shutting down communication with the external Alertmanager group") | ||||
| 	s.sdCancel() | ||||
| 	s.manager.Stop() | ||||
| 	s.wg.Wait() | ||||
|  | @ -197,14 +201,14 @@ func (s *ExternalAlertmanager) sanitizeLabelSet(lbls models.LabelSet) labels.Lab | |||
| 	for _, k := range sortedKeys(lbls) { | ||||
| 		sanitizedLabelName, err := s.sanitizeLabelName(k) | ||||
| 		if err != nil { | ||||
| 			s.logger.Error("alert sending to external Alertmanager(s) contains an invalid label/annotation name that failed to sanitize, skipping", "name", k, "error", err) | ||||
| 			s.logger.Error("Alert sending to external Alertmanager(s) contains an invalid label/annotation name that failed to sanitize, skipping", "name", k, "error", err) | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
| 		// There can be label name collisions after we sanitize. We check for this and attempt to make the name unique again using a short hash of the original name.
 | ||||
| 		if _, ok := set[sanitizedLabelName]; ok { | ||||
| 			sanitizedLabelName = sanitizedLabelName + fmt.Sprintf("_%.3x", md5.Sum([]byte(k))) | ||||
| 			s.logger.Warn("alert contains duplicate label/annotation name after sanitization, appending unique suffix", "name", k, "new_name", sanitizedLabelName, "error", err) | ||||
| 			s.logger.Warn("Alert contains duplicate label/annotation name after sanitization, appending unique suffix", "name", k, "newName", sanitizedLabelName, "error", err) | ||||
| 		} | ||||
| 
 | ||||
| 		set[sanitizedLabelName] = struct{}{} | ||||
|  | @ -227,7 +231,7 @@ func (s *ExternalAlertmanager) sanitizeLabelName(name string) (string, error) { | |||
| 		return name, nil | ||||
| 	} | ||||
| 
 | ||||
| 	s.logger.Warn("alert sending to external Alertmanager(s) contains label/annotation name with invalid characters", "name", name) | ||||
| 	s.logger.Warn("Alert sending to external Alertmanager(s) contains label/annotation name with invalid characters", "name", name) | ||||
| 
 | ||||
| 	// Remove spaces. We do this instead of replacing with underscore for backwards compatibility as this existed before the rest of this function.
 | ||||
| 	sanitized := strings.Join(strings.Fields(name), "") | ||||
|  |  | |||
|  | @ -48,7 +48,7 @@ func TestSanitizeLabelName(t *testing.T) { | |||
| 	} | ||||
| 
 | ||||
| 	for _, tc := range cases { | ||||
| 		am, _ := NewExternalAlertmanagerSender() | ||||
| 		am := NewExternalAlertmanagerSender() | ||||
| 		t.Run(tc.desc, func(t *testing.T) { | ||||
| 			res, err := am.sanitizeLabelName(tc.labelName) | ||||
| 
 | ||||
|  | @ -95,7 +95,7 @@ func TestSanitizeLabelSet(t *testing.T) { | |||
| 	} | ||||
| 
 | ||||
| 	for _, tc := range cases { | ||||
| 		am, _ := NewExternalAlertmanagerSender() | ||||
| 		am := NewExternalAlertmanagerSender() | ||||
| 		t.Run(tc.desc, func(t *testing.T) { | ||||
| 			require.Equal(t, tc.expectedResult, am.sanitizeLabelSet(tc.labelset)) | ||||
| 		}) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue