From 55e7cf1aed6ed69ad257b3b01d1a182ea73f7d0f Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 31 Jan 2023 19:54:38 +0000 Subject: [PATCH] Alerting: Introduce Metric Aggregation starting with Silences (#62512) * Alerting: Introduce Metric Aggregation starting with Silences --------- Co-authored-by: Alexander Weaver --- go.mod | 3 + go.sum | 4 +- .../ngalert/metrics/multi_org_alertmanager.go | 125 +++++++++++++++++- pkg/services/ngalert/metrics/ngalert.go | 10 -- 4 files changed, 126 insertions(+), 16 deletions(-) diff --git a/go.mod b/go.mod index e1051fae364..462a4a0d3d8 100644 --- a/go.mod +++ b/go.mod @@ -420,3 +420,6 @@ replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-aler replace google.golang.org/grpc => google.golang.org/grpc v1.45.0 replace google.golang.org/genproto => google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3 + +// Remove this once https://github.com/grafana/dskit/pull/258 is merged. +replace github.com/grafana/dskit => github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27 diff --git a/go.sum b/go.sum index c66977ffce0..2542f23cbee 100644 --- a/go.sum +++ b/go.sum @@ -1247,14 +1247,14 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27 h1:rWMt8wsjGjzT/6AX6/Ie0JTA0CNZzzbDfup34lSJnTw= +github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27/go.mod h1:ulYLLoSd71AWIjxgifLO86Lndx82Yj+IcV+fFnh8tkI= github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0 h1:BzkQNnj+eevX30EMqJiUS1w3CPoGc8kp7pDf/ari/4Y= github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0/go.mod h1:NoSLbfmUwE+omWFReFrLtbtOItmvTbuQERJ6XFYp9ME= github.com/grafana/codejen v0.0.3 h1:tAWxoTUuhgmEqxJPOLtJoxlPBbMULFwKFOcRsPRPXDw= github.com/grafana/codejen v0.0.3/go.mod h1:zmwwM/DRyQB7pfuBjTWII3CWtxcXh8LTwAYGfDfpR6s= github.com/grafana/cuetsy v0.1.5 h1:mnFwAXdbqCsyL8r7kkdUMJ4kOAR26cxIPmrZj7JzTeY= github.com/grafana/cuetsy v0.1.5/go.mod h1:4KWkUOslwvRTpEv7wdQG0jDFTuJmU+0L9x0h4kWxa2A= -github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8 h1:5nqLvzKugVUb9sCQkKuOPecRshawSrbHsXyGxBkTBus= -github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8/go.mod h1:zj+5BNZAVmQafV583uLTAOzRr963KPdEm4d6NPmtbwg= github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036 h1:GplhUk6Xes5JIhUUrggPcPBhOn+eT8+WsHiebvq7GgA= github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= github.com/grafana/grafana-aws-sdk v0.12.0 h1:eUjFdFZeZE+nyu/RMRz+qFxTBew69ToLBrbRhTbjkfM= diff --git a/pkg/services/ngalert/metrics/multi_org_alertmanager.go b/pkg/services/ngalert/metrics/multi_org_alertmanager.go index c755457755d..056479c27d8 100644 --- a/pkg/services/ngalert/metrics/multi_org_alertmanager.go +++ b/pkg/services/ngalert/metrics/multi_org_alertmanager.go @@ -1,21 +1,31 @@ package metrics import ( + "fmt" + "strconv" + + "github.com/grafana/grafana/pkg/infra/log" + + "github.com/grafana/dskit/metrics" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) type MultiOrgAlertmanager struct { - Registerer prometheus.Registerer + Registerer prometheus.Registerer + registries *metrics.TenantRegistries + ActiveConfigurations prometheus.Gauge DiscoveredConfigurations prometheus.Gauge - registries *OrgRegistries + + aggregatedMetrics *AlertmanagerAggregatedMetrics } func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager { - return &MultiOrgAlertmanager{ + registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not. + moa := &MultiOrgAlertmanager{ Registerer: r, - registries: NewOrgRegistries(), + registries: registries, DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Namespace: Namespace, Subsystem: Subsystem, @@ -28,5 +38,112 @@ func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanag Name: "active_configurations", Help: "The number of active Alertmanager configurations.", }), + aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries), } + + // These metrics use a different registration method as the struct itself represents a custom collector. + // There's no way to "auto-register" a collector. + r.MustRegister(moa.aggregatedMetrics) + + return moa +} + +// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently. +func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) { + moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false) +} + +// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently. +func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer { + sid := strconv.FormatInt(id, 10) + reg := moa.registries.GetRegistryForTenant(sid) + if reg != nil { + return reg + } + + result := prometheus.NewRegistry() + moa.registries.AddTenantRegistry(sid, result) + + return result +} + +// AlertmanagerAggregatedMetrics are metrics collected directly from the registry. +// Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection. +type AlertmanagerAggregatedMetrics struct { + registries *metrics.TenantRegistries + + // exported metrics, gathered from Alertmanager Silences + silencesGCDuration *prometheus.Desc + silencesSnapshotDuration *prometheus.Desc + silencesSnapshotSize *prometheus.Desc + silencesQueriesTotal *prometheus.Desc + silencesQueryErrorsTotal *prometheus.Desc + silencesQueryDuration *prometheus.Desc + silences *prometheus.Desc + silencesPropagatedMessagesTotal *prometheus.Desc +} + +func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics { + aggregatedMetrics := &AlertmanagerAggregatedMetrics{ + registries: registries, + + silencesGCDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem), + "Duration of the last silence garbage collection cycle.", + nil, nil), + silencesSnapshotDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem), + "Duration of the last silence snapshot.", + nil, nil), + silencesSnapshotSize: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem), + "Size of the last silence snapshot in bytes.", + nil, nil), + silencesQueriesTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem), + "How many silence queries were received.", + nil, nil), + silencesQueryErrorsTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem), + "How many silence received queries did not succeed.", + nil, nil), + silencesQueryDuration: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem), + "Duration of silence query evaluation.", + nil, nil), + silencesPropagatedMessagesTotal: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem), + "Number of received gossip messages that have been further gossiped.", + nil, nil), + silences: prometheus.NewDesc( + fmt.Sprintf("%s_%s_silences", Namespace, Subsystem), + "How many silences by state.", + []string{"org", "state"}, nil), + } + + return aggregatedMetrics +} + +func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) { + out <- a.silencesGCDuration + out <- a.silencesSnapshotDuration + out <- a.silencesSnapshotSize + out <- a.silencesQueriesTotal + out <- a.silencesQueryErrorsTotal + out <- a.silencesQueryDuration + out <- a.silencesPropagatedMessagesTotal + out <- a.silences +} + +func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) { + data := a.registries.BuildMetricFamiliesPerTenant() + + data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") + data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") + data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") + data.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total") + data.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") + data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") + data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") + data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state") } diff --git a/pkg/services/ngalert/metrics/ngalert.go b/pkg/services/ngalert/metrics/ngalert.go index ae1a304ed34..0983c054606 100644 --- a/pkg/services/ngalert/metrics/ngalert.go +++ b/pkg/services/ngalert/metrics/ngalert.go @@ -57,13 +57,3 @@ func (ng *NGAlert) GetAPIMetrics() *API { func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager { return ng.multiOrgAlertmanagerMetrics } - -// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently. -func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) { - moa.registries.RemoveOrgRegistry(id) -} - -// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently. -func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer { - return moa.registries.GetOrCreateOrgRegistry(id) -}