Alerting: Add state history backend to write ALERTS metric (#104361)

**What is this feature?**

This PR implements a new Prometheus historian backend that lets Grafana Alerting write alert state history as Prometheus-compatible `ALERTS` metrics to a remote Prometheus-compatible data source, selected by its datasource UID.

In addition to the standard `ALERTS` labels, the metric carries two Grafana-specific labels:

* `grafana_alertstate`: Grafana's full alert state, which is more granular than the Prometheus `alertstate` values.
* `grafana_rule_uid`: the UID of the Grafana alert rule.

Each Grafana state is recorded verbatim in the `grafana_alertstate` label and is also mapped to a Prometheus-compatible `alertstate` value:

| Grafana alert state | `alertstate`          | `grafana_alertstate`  |
|---------------------|-----------------------|-----------------------|
| `Alerting`          | `firing`              | `alerting`            |
| `Recovering`        | `firing`              | `recovering`          |
| `Pending`           | `pending`             | `pending`             |
| `Error`             | `firing`              | `error`               |
| `NoData`            | `firing`              | `nodata`              |
| `Normal`            | _(no metric emitted)_ | _(no metric emitted)_ |
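
For illustration, an active alert instance is written as a single sample with value `1`; in the sketch below, all label values other than `alertstate` and `grafana_alertstate` are hypothetical:

```
ALERTS{alertname="HighRequestLatency", alertstate="firing", grafana_alertstate="alerting", grafana_rule_uid="d9f2c3a1", instance="server1"} 1
```

When the instance later returns to `Normal`, no `0` value is written; instead the backend emits a Prometheus stale marker (`StaleNaN`) for the previous series, so the series simply ends. The history can then be queried in the target data source with ordinary PromQL, for example `ALERTS{grafana_rule_uid="d9f2c3a1"}`.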

Commit ad683f83ff (parent 5448e8fb22) by Alexander Akhmetov, 2025-06-18 07:17:57 +02:00, committed via GitHub.
9 changed files with 965 additions and 41 deletions


@ -1475,8 +1475,10 @@ disabled_labels =
# Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI.
enabled = true
# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple"
# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once.
# Select which pluggable state history backend to use. Either "annotations", "loki", "prometheus", or "multiple"
# "loki" writes state history to an external Loki instance.
# "prometheus" writes state history as ALERTS metrics to a Prometheus-compatible data source.
# "multiple" allows history to be written to multiple backends at once.
# Defaults to "annotations".
backend =
@ -1526,6 +1528,18 @@ loki_max_query_length = 721h
# Default is 64kb
loki_max_query_size = 65536
# For "prometheus" only.
# Target datasource UID for writing ALERTS metrics.
prometheus_target_datasource_uid =
# For "prometheus" only.
# Metric name for the ALERTS metric. Default is "ALERTS".
prometheus_metric_name = ALERTS
# For "prometheus" only.
# Timeout for writing ALERTS metrics to the target datasource. Default is 10s.
prometheus_write_timeout = 10s
[unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided.
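
Putting these options together, a minimal sketch of a working configuration might look as follows; the datasource UID is a placeholder, and `primary`/`secondaries` are the keys read for the "multiple" backend in the settings code further below:

```ini
[unified_alerting.state_history]
enabled = true
backend = prometheus
# UID of the Prometheus-compatible data source to write ALERTS metrics to (placeholder value).
prometheus_target_datasource_uid = my-prometheus-uid
prometheus_metric_name = ALERTS
prometheus_write_timeout = 10s

# Or keep annotations as the primary backend and add Prometheus as a secondary:
# backend = multiple
# primary = annotations
# secondaries = prometheus
```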


@ -1454,8 +1454,10 @@ disabled_labels =
# Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI.
; enabled = true
# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple"
# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once.
# Select which pluggable state history backend to use. Either "annotations", "loki", "prometheus", or "multiple"
# "loki" writes state history to an external Loki instance.
# "prometheus" writes state history as ALERTS metrics to a Prometheus-compatible data source.
# "multiple" allows history to be written to multiple backends at once.
# Defaults to "annotations".
; backend = "multiple"
@ -1505,6 +1507,18 @@ disabled_labels =
# Default is 64kb
;loki_max_query_size = 65536
# For "prometheus" only.
# Target datasource UID for writing ALERTS metrics.
; prometheus_target_datasource_uid = "my-prometheus-uid"
# For "prometheus" only.
# Metric name for the ALERTS metric. Default is "ALERTS".
; prometheus_metric_name = "ALERTS"
# For "prometheus" only.
# Timeout for writing ALERTS metrics to the target datasource. Default is 10s.
; prometheus_write_timeout = 10s
[unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided.


@ -362,7 +362,21 @@ func (ng *AlertNG) init() error {
FeatureToggles: ng.FeatureToggles,
}
history, err := configureHistorianBackend(initCtx, ng.Cfg.UnifiedAlerting.StateHistory, ng.annotationsRepo, ng.dashboardService, ng.store, ng.Metrics.GetHistorianMetrics(), ng.Log, ng.tracer, ac.NewRuleService(ng.accesscontrol))
history, err := configureHistorianBackend(
initCtx,
ng.Cfg.UnifiedAlerting.StateHistory,
ng.annotationsRepo,
ng.dashboardService,
ng.store,
ng.Metrics.GetHistorianMetrics(),
ng.Log,
ng.tracer,
ac.NewRuleService(ng.accesscontrol),
ng.DataSourceService,
ng.httpClientProvider,
clk,
ng.Metrics.GetRemoteWriterMetrics(),
)
if err != nil {
return err
}
@ -606,7 +620,21 @@ type Historian interface {
state.Historian
}
func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian, l log.Logger, tracer tracing.Tracer, ac historian.AccessControl) (Historian, error) {
func configureHistorianBackend(
ctx context.Context,
cfg setting.UnifiedAlertingStateHistorySettings,
ar annotations.Repository,
ds dashboards.DashboardService,
rs historian.RuleStore,
met *metrics.Historian,
l log.Logger,
tracer tracing.Tracer,
ac historian.AccessControl,
datasourceService datasources.DataSourceService,
httpClientProvider httpclient.Provider,
clock clock.Clock,
mw *metrics.RemoteWriter,
) (Historian, error) {
if !cfg.Enabled {
met.Info.WithLabelValues("noop").Set(0)
return historian.NewNopHistorian(), nil
@ -621,7 +649,7 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
if backend == historian.BackendTypeMultiple {
primaryCfg := cfg
primaryCfg.Backend = cfg.MultiPrimary
primary, err := configureHistorianBackend(ctx, primaryCfg, ar, ds, rs, met, l, tracer, ac)
primary, err := configureHistorianBackend(ctx, primaryCfg, ar, ds, rs, met, l, tracer, ac, datasourceService, httpClientProvider, clock, mw)
if err != nil {
return nil, fmt.Errorf("multi-backend target \"%s\" was misconfigured: %w", cfg.MultiPrimary, err)
}
@ -630,7 +658,7 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
for _, b := range cfg.MultiSecondaries {
secCfg := cfg
secCfg.Backend = b
sec, err := configureHistorianBackend(ctx, secCfg, ar, ds, rs, met, l, tracer, ac)
sec, err := configureHistorianBackend(ctx, secCfg, ar, ds, rs, met, l, tracer, ac, datasourceService, httpClientProvider, clock, mw)
if err != nil {
return nil, fmt.Errorf("multi-backend target \"%s\" was misconfigured: %w", b, err)
}
@ -642,7 +670,8 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
}
if backend == historian.BackendTypeAnnotations {
store := historian.NewAnnotationStore(ar, ds, met)
annotationBackendLogger := log.New("ngalert.state.historian", "backend", "annotations")
logCtx := log.WithContextualAttributes(ctx, []any{"backend", "annotations"})
annotationBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
return historian.NewAnnotationBackend(annotationBackendLogger, store, rs, met, ac), nil
}
if backend == historian.BackendTypeLoki {
@ -651,7 +680,8 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
return nil, fmt.Errorf("invalid remote loki configuration: %w", err)
}
req := historian.NewRequester()
lokiBackendLogger := log.New("ngalert.state.historian", "backend", "loki")
logCtx := log.WithContextualAttributes(ctx, []any{"backend", "loki"})
lokiBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
backend := historian.NewRemoteLokiBackend(lokiBackendLogger, lcfg, req, met, tracer, rs, ac)
testConnCtx, cancelFunc := context.WithTimeout(ctx, 10*time.Second)
@ -662,6 +692,25 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
return backend, nil
}
if backend == historian.BackendTypePrometheus {
pcfg, err := historian.NewPrometheusConfig(cfg)
if err != nil {
return nil, fmt.Errorf("invalid remote prometheus configuration: %w", err)
}
writerCfg := writer.DatasourceWriterConfig{
Timeout: cfg.PrometheusWriteTimeout,
}
logCtx := log.WithContextualAttributes(ctx, []any{"backend", "prometheus"})
prometheusBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
w := writer.NewDatasourceWriter(writerCfg, datasourceService, httpClientProvider, clock, prometheusBackendLogger, mw)
if w == nil {
return nil, fmt.Errorf("failed to create alert state metrics writer")
}
backend := historian.NewRemotePrometheusBackend(pcfg, w, prometheusBackendLogger)
return backend, nil
}
return nil, fmt.Errorf("unrecognized state history backend: %s", backend)
}


@ -90,7 +90,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
}
ac := &acfakes.FakeRuleService{}
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.ErrorContains(t, err, "unrecognized")
})
@ -106,7 +106,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
}
ac := &acfakes.FakeRuleService{}
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.ErrorContains(t, err, "multi-backend target")
require.ErrorContains(t, err, "unrecognized")
@ -124,7 +124,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
}
ac := &acfakes.FakeRuleService{}
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.ErrorContains(t, err, "multi-backend target")
require.ErrorContains(t, err, "unrecognized")
@ -143,7 +143,42 @@ func TestConfigureHistorianBackend(t *testing.T) {
}
ac := &acfakes.FakeRuleService{}
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.NotNil(t, h)
require.NoError(t, err)
})
t.Run("fail initialization if prometheus backend missing datasource UID", func(t *testing.T) {
met := metrics.NewHistorianMetrics(prometheus.NewRegistry(), metrics.Subsystem)
logger := log.NewNopLogger()
tracer := tracing.InitializeTracerForTest()
cfg := setting.UnifiedAlertingStateHistorySettings{
Enabled: true,
Backend: "prometheus",
// Missing PrometheusTargetDatasourceUID
}
ac := &acfakes.FakeRuleService{}
_, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.Error(t, err)
require.ErrorContains(t, err, "datasource UID must not be empty")
})
t.Run("successful initialization of prometheus backend", func(t *testing.T) {
met := metrics.NewHistorianMetrics(prometheus.NewRegistry(), metrics.Subsystem)
logger := log.NewNopLogger()
tracer := tracing.InitializeTracerForTest()
cfg := setting.UnifiedAlertingStateHistorySettings{
Enabled: true,
Backend: "prometheus",
PrometheusMetricName: "test_metric",
PrometheusTargetDatasourceUID: "test-prometheus-uid",
}
ac := &acfakes.FakeRuleService{}
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.NotNil(t, h)
require.NoError(t, err)
@ -160,7 +195,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
}
ac := &acfakes.FakeRuleService{}
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.NotNil(t, h)
require.NoError(t, err)
@ -183,7 +218,7 @@ grafana_alerting_state_history_info{backend="annotations"} 1
}
ac := &acfakes.FakeRuleService{}
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
require.NotNil(t, h)
require.NoError(t, err)


@ -17,6 +17,7 @@ const (
BackendTypeAnnotations BackendType = "annotations"
BackendTypeLoki BackendType = "loki"
BackendTypeMultiple BackendType = "multiple"
BackendTypePrometheus BackendType = "prometheus"
BackendTypeNoop BackendType = "noop"
)
@ -27,6 +28,7 @@ func ParseBackendType(s string) (BackendType, error) {
BackendTypeAnnotations: {},
BackendTypeLoki: {},
BackendTypeMultiple: {},
BackendTypePrometheus: {},
BackendTypeNoop: {},
}
p := BackendType(norm)


@ -0,0 +1,242 @@
package historian
import (
"context"
"errors"
"fmt"
"maps"
"math"
"strings"
"time"
"github.com/grafana/dataplane/sdata/numeric"
"github.com/grafana/grafana-plugin-sdk-go/data"
promValue "github.com/prometheus/prometheus/model/value"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/grafana/grafana/pkg/setting"
)
const (
// Label names for the alert metric.
alertNameLabel = "alertname"
// alertStateLabel is the label used to indicate
// the Prometheus-style alert state: firing or pending.
alertStateLabel = "alertstate"
// grafanaAlertStateLabel is the label used to indicate the Grafana-style
// alert state: alerting, pending, recovering, etc.
grafanaAlertStateLabel = "grafana_alertstate"
alertRuleUIDLabel = "grafana_rule_uid"
)
// isMetricEmittingState defines which evaluation states should emit ALERTS metrics.
// Basically every state that is not Normal should emit metrics currently,
// and is defined here as an allowed state.
func isMetricEmittingState(state eval.State) bool {
metricEmittingStates := map[eval.State]struct{}{
eval.Alerting: {},
eval.Pending: {},
eval.Recovering: {},
eval.Error: {},
eval.NoData: {},
}
_, ok := metricEmittingStates[state]
return ok
}
// getPrometheusState maps Grafana states to Prometheus alert states.
// In Prometheus, the alertstate label in the ALERTS metric can be either "firing" or "pending",
// so we need to convert Grafana states accordingly.
func getPrometheusState(grafanaState eval.State) string {
if grafanaState == eval.Recovering || grafanaState == eval.Alerting || grafanaState == eval.Error || grafanaState == eval.NoData {
return "firing"
}
return strings.ToLower(grafanaState.String())
}
type seriesWriter interface {
WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error
}
type PrometheusConfig struct {
DatasourceUID string
MetricName string
}
func NewPrometheusConfig(cfg setting.UnifiedAlertingStateHistorySettings) (PrometheusConfig, error) {
if cfg.PrometheusTargetDatasourceUID == "" {
return PrometheusConfig{}, errors.New("datasource UID must not be empty")
}
if cfg.PrometheusMetricName == "" {
return PrometheusConfig{}, errors.New("metric name must not be empty")
}
return PrometheusConfig{
DatasourceUID: cfg.PrometheusTargetDatasourceUID,
MetricName: cfg.PrometheusMetricName,
}, nil
}
type RemotePrometheusBackend struct {
cfg PrometheusConfig
promWriter seriesWriter
logger log.Logger
}
func NewRemotePrometheusBackend(cfg PrometheusConfig, promWriter seriesWriter, logger log.Logger) *RemotePrometheusBackend {
logger.Info("Initializing remote Prometheus backend", "datasourceUID", cfg.DatasourceUID)
return &RemotePrometheusBackend{
cfg: cfg,
promWriter: promWriter,
logger: logger,
}
}
func (b *RemotePrometheusBackend) Query(ctx context.Context, query models.HistoryQuery) (*data.Frame, error) {
return nil, fmt.Errorf("prometheus historian backend does not support querying")
}
func (b *RemotePrometheusBackend) Record(ctx context.Context, rule history_model.RuleMeta, transitions []state.StateTransition) <-chan error {
errCh := make(chan error, 1)
if len(transitions) == 0 {
errCh <- nil
close(errCh)
return errCh
}
logger := b.logger.FromContext(ctx)
var frames data.Frames
for _, t := range transitions {
transitionFrames := b.framesFor(ctx, rule, t)
frames = append(frames, transitionFrames...)
}
if len(frames) == 0 {
logger.Debug("No frames generated for alert state metric, nothing to write")
errCh <- nil
close(errCh)
return errCh
}
st := transitions[0]
go func() {
defer func() {
if r := recover(); r != nil {
logger.Error("Panic in prometheus historian", "error", r)
errCh <- fmt.Errorf("prometheus historian panic: %v", r)
}
close(errCh)
}()
var sendErr error
if err := b.promWriter.WriteDatasource(ctx, b.cfg.DatasourceUID, b.cfg.MetricName, st.LastEvaluationTime, frames, st.OrgID, nil); err != nil {
logger.Error("Failed to write alert state metrics batch", "error", err)
sendErr = err
}
errCh <- sendErr
}()
return errCh
}
// framesFor converts a single StateTransition to multiple data.Frames to handle
// transitions that require both StaleNaN for previous state and active metric for current state.
//
// StaleNaN: in the case of a transition from a metric-emitting state to a non-emitting state,
// or when the series changes from one metric-emitting state to another, we should emit a StaleNaN sample
// for the previous state to stop it in Prometheus:
// https://prometheus.io/docs/specs/prw/remote_write_spec/#stale-markers
func (b *RemotePrometheusBackend) framesFor(ctx context.Context, rule history_model.RuleMeta, t state.StateTransition) []*data.Frame {
samples := getSamples(t)
if len(samples) == 0 {
return nil
}
logger := b.logger.FromContext(ctx)
baseLabels := removePrivateLabels(t.Labels)
baseLabels[alertRuleUIDLabel] = t.AlertRuleUID
baseLabels[alertNameLabel] = rule.Title
frameMeta := &data.FrameMeta{
Type: data.FrameTypeNumericMulti,
TypeVersion: numeric.MultiFrameVersionLatest,
}
frames := make([]*data.Frame, len(samples))
for i, sample := range samples {
labels := make(data.Labels, len(baseLabels)+2)
maps.Copy(labels, baseLabels)
labels[alertStateLabel] = sample.promState
labels[grafanaAlertStateLabel] = sample.grafanaState
logger.Debug("Creating metric with labels",
"rule_uid", t.AlertRuleUID,
"previous_state", t.PreviousState,
"current_state", t.State.State,
"last_evaluation_time", t.LastEvaluationTime,
"rule_title", rule.Title,
"labels", labels,
"value", sample.value,
)
field := data.NewField("", labels, []float64{sample.value})
frames[i] = data.NewFrame(b.cfg.MetricName, field)
frames[i].SetMeta(frameMeta)
}
return frames
}
type sample struct {
value float64
grafanaState string
promState string
}
// getSamples generates samples based on the state transition.
func getSamples(tr state.StateTransition) []*sample {
curr, prev := tr.State.State, tr.PreviousState
var samples []*sample
// If transitioning from a metric-emitting state to a different state,
// emit a StaleNaN sample for the previous state to stop it in Prometheus.
if isMetricEmittingState(prev) && prev != curr {
prevState := strings.ToLower(prev.String())
prevPromState := getPrometheusState(prev)
samples = append(samples, &sample{
value: math.Float64frombits(promValue.StaleNaN),
grafanaState: prevState,
promState: prevPromState,
})
}
if isMetricEmittingState(curr) {
currState := strings.ToLower(curr.String())
currPromState := getPrometheusState(curr)
samples = append(samples, &sample{
value: 1.0,
grafanaState: currState,
promState: currPromState,
})
}
return samples
}
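
To make the stale-marker handling in `getSamples` concrete: a `Pending` to `Alerting` transition produces two conceptual samples at the evaluation timestamp, one closing the old series and one opening the new one. Label values below are hypothetical and `StaleNaN` is shown symbolically:

```
ALERTS{alertname="HighRequestLatency", alertstate="pending", grafana_alertstate="pending", grafana_rule_uid="d9f2c3a1", instance="server1"} StaleNaN
ALERTS{alertname="HighRequestLatency", alertstate="firing", grafana_alertstate="alerting", grafana_rule_uid="d9f2c3a1", instance="server1"} 1
```

A transition to `Normal` from any metric-emitting state produces only the first kind of sample, while a transition from `Normal` to a metric-emitting state produces only the second.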


@ -0,0 +1,344 @@
package historian
import (
"context"
"errors"
"math"
"testing"
"time"
"github.com/grafana/dataplane/sdata/numeric"
"github.com/grafana/grafana-plugin-sdk-go/data"
promValue "github.com/prometheus/prometheus/model/value"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/grafana/grafana/pkg/setting"
)
const (
testMetricName = "test_metric_name"
)
type fakeRemoteWriter struct {
mock.Mock
}
func (f *fakeRemoteWriter) WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error {
args := f.Called(ctx, dsUID, name, t, frames, orgID, extraLabels)
return args.Error(0)
}
type panicRemoteWriter struct {
mock.Mock
panicMessage string
}
func (p *panicRemoteWriter) WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error {
p.Called(ctx, dsUID, name, t, frames, orgID, extraLabels)
panic(p.panicMessage)
}
func TestNewRemotePrometheusBackend(t *testing.T) {
cfg, err := NewPrometheusConfig(setting.UnifiedAlertingStateHistorySettings{
PrometheusTargetDatasourceUID: "test-ds-uid",
PrometheusMetricName: testMetricName,
})
require.NoError(t, err)
fakeWriter := new(fakeRemoteWriter)
logger := log.NewNopLogger()
backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)
require.NotNil(t, backend)
require.Equal(t, cfg.DatasourceUID, backend.cfg.DatasourceUID)
require.Equal(t, fakeWriter, backend.promWriter)
require.Equal(t, logger, backend.logger)
}
func createExpectedFrame(t *testing.T, ruleUID, ruleName, promState, grafanaState string, instanceLabels data.Labels, value float64) *data.Frame {
t.Helper()
labels := instanceLabels.Copy()
labels[alertRuleUIDLabel] = ruleUID
labels[alertNameLabel] = ruleName
labels[alertStateLabel] = promState
labels[grafanaAlertStateLabel] = grafanaState
valueField := data.NewField("", labels, []float64{value})
frame := data.NewFrame(testMetricName, valueField)
frame.SetMeta(&data.FrameMeta{
Type: data.FrameTypeNumericMulti,
TypeVersion: numeric.MultiFrameVersionLatest,
})
return frame
}
func createTransition(from, to eval.State, orgID int64, now time.Time) state.StateTransition {
return state.StateTransition{
State: &state.State{AlertRuleUID: "rule-uid", OrgID: orgID, Labels: data.Labels{"instance": "server1"}, State: to, LastEvaluationTime: now},
PreviousState: from,
}
}
func assertFramesEqual(t *testing.T, actualFrames data.Frames, expectedFrames data.Frames) {
t.Helper()
require.Len(t, actualFrames, len(expectedFrames))
for i, expectedFrame := range expectedFrames {
actualFrame := actualFrames[i]
require.Equal(t, expectedFrame.Name, actualFrame.Name)
require.Len(t, actualFrame.Fields, 1)
expectedField := expectedFrame.Fields[0]
actualField := actualFrame.Fields[0]
// Check labels
require.Equal(t, expectedField.Labels, actualField.Labels)
// Check values with NaN handling
expectedValue := expectedField.At(0).(float64)
actualValue := actualField.At(0).(float64)
if math.IsNaN(expectedValue) {
require.True(t, math.IsNaN(actualValue))
} else {
require.Equal(t, expectedValue, actualValue)
}
}
}
func TestPrometheusBackend_Record(t *testing.T) {
cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
logger := log.NewNopLogger()
ctx := context.Background()
orgID := int64(1)
now := time.Now()
ruleMeta := history_model.RuleMeta{Title: "test rule"}
testCases := []struct {
name string
ruleMeta history_model.RuleMeta
states []state.StateTransition
expectedErr error
expectedFrames data.Frames
}{
{
name: "No states",
ruleMeta: history_model.RuleMeta{Title: "Test Rule No States"},
states: []state.StateTransition{},
},
{
name: "normal state only (no metrics emitted)",
ruleMeta: ruleMeta,
states: []state.StateTransition{
{State: &state.State{AlertRuleUID: "rule-uid-normal", OrgID: orgID, Labels: data.Labels{"label1": "value1"}, State: eval.Normal, LastEvaluationTime: now}},
},
},
{
name: "remote writer error",
ruleMeta: ruleMeta,
states: []state.StateTransition{
{State: &state.State{AlertRuleUID: "rule-uid-err", OrgID: orgID, Labels: data.Labels{}, State: eval.Alerting, LastEvaluationTime: now}},
},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid-err", "test rule", "firing", "alerting", data.Labels{}, 1),
},
expectedErr: errors.New("remote write failed"),
},
{
name: "internal labels are skipped",
ruleMeta: ruleMeta,
states: []state.StateTransition{
{
State: &state.State{
AlertRuleUID: "rule-uid-internal",
OrgID: orgID,
Labels: data.Labels{ngmodels.AutogeneratedRouteLabel: "ignored", "label1": "value1", "__label2": "value2"},
State: eval.Alerting,
LastEvaluationTime: now,
},
},
},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid-internal", "test rule", "firing", "alerting", data.Labels{"label1": "value1"}, 1.0),
},
},
{
name: "mixed states (normal, pending, recovering, error, nodata)",
ruleMeta: ruleMeta,
states: []state.StateTransition{
{State: &state.State{AlertRuleUID: "rule-uid-normal", OrgID: orgID, Labels: data.Labels{"state": "normal"}, State: eval.Normal, LastEvaluationTime: now}},
{State: &state.State{AlertRuleUID: "rule-uid-pending", OrgID: orgID, Labels: data.Labels{"state": "pending"}, State: eval.Pending, LastEvaluationTime: now}},
{State: &state.State{AlertRuleUID: "rule-uid-recovering", OrgID: orgID, Labels: data.Labels{"state": "recovering"}, State: eval.Recovering, LastEvaluationTime: now}},
{State: &state.State{AlertRuleUID: "rule-uid-error", OrgID: orgID, Labels: data.Labels{"state": "error"}, State: eval.Error, LastEvaluationTime: now}},
{State: &state.State{AlertRuleUID: "rule-uid-nodata", OrgID: orgID, Labels: data.Labels{"state": "nodata"}, State: eval.NoData, LastEvaluationTime: now}},
},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid-pending", "test rule", "pending", "pending", data.Labels{"state": "pending"}, 1.0),
createExpectedFrame(t, "rule-uid-recovering", "test rule", "firing", "recovering", data.Labels{"state": "recovering"}, 1.0),
createExpectedFrame(t, "rule-uid-error", "test rule", "firing", "error", data.Labels{"state": "error"}, 1.0),
createExpectedFrame(t, "rule-uid-nodata", "test rule", "firing", "nodata", data.Labels{"state": "nodata"}, 1.0),
},
},
// State transitions - Normal to other states (single active frame)
{
name: "normal to alerting transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Normal, eval.Alerting, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, 1.0),
},
},
{
name: "normal to pending transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Normal, eval.Pending, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "pending", "pending", data.Labels{"instance": "server1"}, 1.0),
},
},
{
name: "normal to error transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Normal, eval.Error, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "firing", "error", data.Labels{"instance": "server1"}, 1.0),
},
},
// Transitions to Normal (StaleNaN only)
{
name: "alerting to normal transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Alerting, eval.Normal, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
},
},
{
name: "error to normal transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Error, eval.Normal, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "firing", "error", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
},
},
{
name: "pending to alerting transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Pending, eval.Alerting, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "pending", "pending", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, 1.0),
},
},
{
name: "alerting to recovering transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Alerting, eval.Recovering, orgID, now)},
expectedFrames: data.Frames{
createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
createExpectedFrame(t, "rule-uid", "test rule", "firing", "recovering", data.Labels{"instance": "server1"}, 1.0),
},
},
// No metric should be written
{
name: "Normal to Normal transition",
ruleMeta: ruleMeta,
states: []state.StateTransition{createTransition(eval.Normal, eval.Normal, orgID, now)},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
fakeWriter := new(fakeRemoteWriter)
backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)
if tc.expectedFrames != nil {
var extraLabels map[string]string
fakeWriter.On(
"WriteDatasource", ctx, cfg.DatasourceUID, testMetricName, now, mock.Anything, orgID, extraLabels,
).Return(tc.expectedErr).Once().Run(func(args mock.Arguments) {
if tc.expectedErr == nil {
actualFrames := args.Get(4).(data.Frames)
assertFramesEqual(t, actualFrames, tc.expectedFrames)
}
})
}
errCh := backend.Record(ctx, tc.ruleMeta, tc.states)
err, ok := <-errCh
require.True(t, ok)
if tc.expectedErr == nil {
require.Nil(t, err)
} else {
require.ErrorIs(t, err, tc.expectedErr)
}
fakeWriter.AssertExpectations(t)
if tc.expectedFrames == nil {
fakeWriter.AssertNotCalled(t, "WriteDatasource", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
}
})
}
}
func TestPrometheusBackend_Query(t *testing.T) {
cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
logger := log.NewNopLogger()
fakeWriter := new(fakeRemoteWriter)
backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)
frame, err := backend.Query(context.Background(), ngmodels.HistoryQuery{})
require.Error(t, err)
require.Nil(t, frame)
require.Contains(t, err.Error(), "prometheus historian backend does not support querying")
}
func TestPrometheusBackend_Record_PanicRecovery(t *testing.T) {
cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
logger := log.NewNopLogger()
ctx := context.Background()
orgID := int64(1)
now := time.Now()
ruleMeta := history_model.RuleMeta{Title: "test rule"}
panicMessage := "panic in WriteDatasource"
panicWriter := &panicRemoteWriter{panicMessage: panicMessage}
panicWriter.On("WriteDatasource", ctx, cfg.DatasourceUID, testMetricName, now, mock.Anything, orgID, mock.Anything).Once()
backend := NewRemotePrometheusBackend(cfg, panicWriter, logger)
states := []state.StateTransition{
{State: &state.State{
AlertRuleUID: "rule-uid-panic",
OrgID: orgID,
Labels: data.Labels{"test": "panic"},
State: eval.Alerting,
LastEvaluationTime: now,
}},
}
errCh := backend.Record(ctx, ruleMeta, states)
err, ok := <-errCh
require.True(t, ok)
require.Error(t, err)
require.ErrorContains(t, err, "prometheus historian panic")
require.ErrorContains(t, err, panicMessage)
panicWriter.AssertExpectations(t)
}


@ -2058,3 +2058,219 @@ func mergeLabels(a, b data.Labels) data.Labels {
}
return result
}
// TestStateManager_HistorianIntegration tests that the state manager properly sends
// all expected state transitions to the historian backend.
func TestStateManager_HistorianIntegration(t *testing.T) {
baseInterval := 1 * time.Second
tN := func(n int) time.Time {
return time.Unix(0, 0).UTC().Add(time.Duration(n) * baseInterval)
}
t1 := tN(1)
t2 := tN(2)
t3 := tN(3)
labels1 := data.Labels{"instance": "server1", "job": "webapp"}
labels2 := data.Labels{"instance": "server2", "job": "webapp"}
baseRule := &models.AlertRule{
ID: 1,
OrgID: 1,
Title: "test rule",
UID: "test-rule-uid",
NamespaceUID: "test-namespace",
IntervalSeconds: 10,
NoDataState: models.NoData,
ExecErrState: models.ErrorErrState,
For: 0,
}
type transition struct {
previousState eval.State
currentState eval.State
}
scenarios := []struct {
name string
rule *models.AlertRule
evaluations map[time.Time][]eval.Result
expectedTransitions map[time.Time][]transition
}{
{
name: "1:normal -> 1:alerting -> 1:normal",
rule: baseRule,
evaluations: map[time.Time][]eval.Result{
t1: {
{Instance: labels1, State: eval.Normal},
},
t2: {
{Instance: labels1, State: eval.Alerting},
},
t3: {
{Instance: labels1, State: eval.Normal},
},
},
expectedTransitions: map[time.Time][]transition{
t1: {
{previousState: eval.Normal, currentState: eval.Normal},
},
t2: {
{previousState: eval.Normal, currentState: eval.Alerting},
},
t3: {
{previousState: eval.Alerting, currentState: eval.Normal},
},
},
},
{
name: "1:alerting, 2:alerting -> 2:alerting -> {}",
rule: baseRule,
evaluations: map[time.Time][]eval.Result{
t1: {
{Instance: labels1, State: eval.Alerting},
{Instance: labels2, State: eval.Alerting},
},
t2: {
// labels1 is missing from this evaluation
{Instance: labels2, State: eval.Alerting},
},
t3: {
// Both labels1 and labels2 are missing
},
},
expectedTransitions: map[time.Time][]transition{
t1: {
{previousState: eval.Normal, currentState: eval.Alerting},
{previousState: eval.Normal, currentState: eval.Alerting},
},
t2: {
{previousState: eval.Alerting, currentState: eval.Alerting},
{previousState: eval.Alerting, currentState: eval.Alerting},
},
t3: {
{previousState: eval.Alerting, currentState: eval.Alerting},
{previousState: eval.Alerting, currentState: eval.Alerting},
},
},
},
{
name: "1:alerting -> {} -> {}",
rule: baseRule,
evaluations: map[time.Time][]eval.Result{
t1: {
{Instance: labels1, State: eval.Alerting, EvaluatedAt: t1},
},
t2: {
// labels1 is missing - first missing evaluation
},
t3: {
// labels1 is still missing - second missing evaluation
},
},
expectedTransitions: map[time.Time][]transition{
t1: {
{previousState: eval.Normal, currentState: eval.Alerting},
},
t2: {
{previousState: eval.Alerting, currentState: eval.Alerting},
},
t3: {
{previousState: eval.Alerting, currentState: eval.Alerting},
},
},
},
{
name: "1:alerting -> 1:recovering -> 1:normal",
rule: &models.AlertRule{
ID: 1,
OrgID: 1,
Title: "test rule",
UID: "test-rule-uid",
NamespaceUID: "test-namespace",
IntervalSeconds: 10,
NoDataState: models.NoData,
ExecErrState: models.ErrorErrState,
For: 0,
KeepFiringFor: 10,
},
evaluations: map[time.Time][]eval.Result{
t1: {
{Instance: labels1, State: eval.Alerting},
},
t2: {
{Instance: labels1, State: eval.Normal},
},
t3: {
{Instance: labels1, State: eval.Normal},
},
},
expectedTransitions: map[time.Time][]transition{
t1: {
{previousState: eval.Normal, currentState: eval.Alerting},
},
t2: {
{previousState: eval.Alerting, currentState: eval.Recovering},
},
t3: {
{previousState: eval.Recovering, currentState: eval.Normal},
},
},
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
historian := &state.FakeHistorian{}
cfg := state.ManagerCfg{
Metrics: metrics.NewNGAlert(prometheus.NewPedanticRegistry()).GetStateMetrics(),
ExternalURL: nil,
InstanceStore: &state.FakeInstanceStore{},
Images: &state.NotAvailableImageService{},
Clock: clock.NewMock(),
Historian: historian,
Tracer: tracing.InitializeTracerForTest(),
Log: log.NewNopLogger(),
}
mgr := state.NewManager(cfg, state.NewNoopPersister())
// Helper function to process one time step and verify historian
processTimeStep := func(evalTime time.Time) {
results := scenario.evaluations[evalTime]
expectedTransitions := scenario.expectedTransitions[evalTime]
for i := range results {
results[i].EvaluatedAt = evalTime
}
// Clear historian state transitions before the evaluation
historian.StateTransitions = nil
mgr.ProcessEvalResults(
context.Background(),
evalTime,
scenario.rule,
results,
make(data.Labels),
nil,
)
// Extract just the data we care about from the actual transitions
actualTransitions := make([]transition, len(historian.StateTransitions))
for i, t := range historian.StateTransitions {
actualTransitions[i] = transition{
previousState: t.PreviousState,
currentState: t.State.State,
}
}
require.ElementsMatch(t, expectedTransitions, actualTransitions)
}
processTimeStep(t1)
processTimeStep(t2)
processTimeStep(t3)
})
}
}


@ -62,11 +62,13 @@ const (
// with intervals that are not exactly divided by this number not to be evaluated
SchedulerBaseInterval = 10 * time.Second
// DefaultRuleEvaluationInterval indicates a default interval for how long a rule should be evaluated to change state from Pending to Alerting
DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds
stateHistoryDefaultEnabled = true
lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki
defaultRecordingRequestTimeout = 10 * time.Second
lokiDefaultMaxQuerySize = 65536 // 64kb
DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds
stateHistoryDefaultEnabled = true
lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki
defaultRecordingRequestTimeout = 10 * time.Second
lokiDefaultMaxQuerySize = 65536 // 64kb
defaultHistorianPrometheusWriteTimeout = 10 * time.Second
defaultHistorianPrometheusMetricName = "ALERTS"
)
var (
@ -188,13 +190,16 @@ type UnifiedAlertingStateHistorySettings struct {
LokiTenantID string
// LokiBasicAuthUsername and LokiBasicAuthPassword are used for basic auth
// if one of them is set.
LokiBasicAuthPassword string
LokiBasicAuthUsername string
LokiMaxQueryLength time.Duration
LokiMaxQuerySize int
MultiPrimary string
MultiSecondaries []string
ExternalLabels map[string]string
LokiBasicAuthPassword string
LokiBasicAuthUsername string
LokiMaxQueryLength time.Duration
LokiMaxQuerySize int
PrometheusMetricName string
PrometheusTargetDatasourceUID string
PrometheusWriteTimeout time.Duration
MultiPrimary string
MultiSecondaries []string
ExternalLabels map[string]string
}
// IsEnabled returns true if UnifiedAlertingSettings.Enabled is either nil or true.
@ -448,19 +453,22 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
stateHistory := iniFile.Section("unified_alerting.state_history")
stateHistoryLabels := iniFile.Section("unified_alerting.state_history.external_labels")
uaCfgStateHistory := UnifiedAlertingStateHistorySettings{
Enabled: stateHistory.Key("enabled").MustBool(stateHistoryDefaultEnabled),
Backend: stateHistory.Key("backend").MustString("annotations"),
LokiRemoteURL: stateHistory.Key("loki_remote_url").MustString(""),
LokiReadURL: stateHistory.Key("loki_remote_read_url").MustString(""),
LokiWriteURL: stateHistory.Key("loki_remote_write_url").MustString(""),
LokiTenantID: stateHistory.Key("loki_tenant_id").MustString(""),
LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""),
LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""),
LokiMaxQueryLength: stateHistory.Key("loki_max_query_length").MustDuration(lokiDefaultMaxQueryLength),
LokiMaxQuerySize: stateHistory.Key("loki_max_query_size").MustInt(lokiDefaultMaxQuerySize),
MultiPrimary: stateHistory.Key("primary").MustString(""),
MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
ExternalLabels: stateHistoryLabels.KeysHash(),
Enabled: stateHistory.Key("enabled").MustBool(stateHistoryDefaultEnabled),
Backend: stateHistory.Key("backend").MustString("annotations"),
LokiRemoteURL: stateHistory.Key("loki_remote_url").MustString(""),
LokiReadURL: stateHistory.Key("loki_remote_read_url").MustString(""),
LokiWriteURL: stateHistory.Key("loki_remote_write_url").MustString(""),
LokiTenantID: stateHistory.Key("loki_tenant_id").MustString(""),
LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""),
LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""),
LokiMaxQueryLength: stateHistory.Key("loki_max_query_length").MustDuration(lokiDefaultMaxQueryLength),
LokiMaxQuerySize: stateHistory.Key("loki_max_query_size").MustInt(lokiDefaultMaxQuerySize),
MultiPrimary: stateHistory.Key("primary").MustString(""),
MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
PrometheusMetricName: stateHistory.Key("prometheus_metric_name").MustString(defaultHistorianPrometheusMetricName),
PrometheusTargetDatasourceUID: stateHistory.Key("prometheus_target_datasource_uid").MustString(""),
PrometheusWriteTimeout: stateHistory.Key("prometheus_write_timeout").MustDuration(defaultHistorianPrometheusWriteTimeout),
ExternalLabels: stateHistoryLabels.KeysHash(),
}
uaCfg.StateHistory = uaCfgStateHistory