mirror of https://github.com/grafana/grafana.git
Alerting: Add state history backend to write ALERTS metric (#104361)
**What is this feature?**

This PR implements a new Prometheus historian backend that allows Grafana alerting to write alert state history as Prometheus-compatible `ALERTS` metrics to remote Prometheus-compatible data sources. The metric includes a few additional labels:

* `grafana_alertstate`: Grafana's full alert state, which is more granular than the Prometheus one.
* `grafana_rule_uid`: the Grafana alert rule UID.

Grafana states are carried in the `grafana_alertstate` label and are also mapped to Prometheus-compatible `alertstate` values:

| Grafana alert state | `alertstate`          | `grafana_alertstate`  |
|---------------------|-----------------------|-----------------------|
| `Alerting`          | `firing`              | `alerting`            |
| `Recovering`        | `firing`              | `recovering`          |
| `Pending`           | `pending`             | `pending`             |
| `Error`             | `firing`              | `error`               |
| `NoData`            | `firing`              | `nodata`              |
| `Normal`            | _(no metric emitted)_ | _(no metric emitted)_ |
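For illustration only, a minimal configuration sketch for enabling this backend, based on the settings this PR adds to the sample configuration (`my-prometheus-uid` is a placeholder for the UID of an existing Prometheus-compatible data source):

```ini
[unified_alerting.state_history]
enabled = true
backend = prometheus
# UID of the Prometheus-compatible data source that should receive the ALERTS metric (placeholder value).
prometheus_target_datasource_uid = my-prometheus-uid
# Optional overrides; these match the documented defaults.
prometheus_metric_name = ALERTS
prometheus_write_timeout = 10s
```

Once the backend is writing, the series can be queried on the target data source much like the built-in metric, e.g. `ALERTS{alertstate="firing", grafana_alertstate="recovering"}`, with `grafana_rule_uid` available to narrow results to a single rule.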
This commit is contained in:
parent 5448e8fb22
commit ad683f83ff
@@ -1475,8 +1475,10 @@ disabled_labels =
 # Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI.
 enabled = true

-# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple"
-# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once.
+# Select which pluggable state history backend to use. Either "annotations", "loki", "prometheus", or "multiple"
+# "loki" writes state history to an external Loki instance.
+# "prometheus" writes state history as ALERTS metrics to a Prometheus-compatible data source.
+# "multiple" allows history to be written to multiple backends at once.
+# Defaults to "annotations".
 backend =

@@ -1526,6 +1528,18 @@ loki_max_query_length = 721h
 # Default is 64kb
 loki_max_query_size = 65536

+# For "prometheus" only.
+# Target datasource UID for writing ALERTS metrics.
+prometheus_target_datasource_uid =
+
+# For "prometheus" only.
+# Metric name for the ALERTS metric. Default is "ALERTS".
+prometheus_metric_name = ALERTS
+
+# For "prometheus" only.
+# Timeout for writing ALERTS metrics to the target datasource. Default is 10s.
+prometheus_write_timeout = 10s
+
 [unified_alerting.state_history.external_labels]
 # Optional extra labels to attach to outbound state history records or log streams.
 # Any number of label key-value-pairs can be provided.
@@ -1454,8 +1454,10 @@ disabled_labels =
 # Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI.
 ; enabled = true

-# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple"
-# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once.
+# Select which pluggable state history backend to use. Either "annotations", "loki", "prometheus", or "multiple"
+# "loki" writes state history to an external Loki instance.
+# "prometheus" writes state history as ALERTS metrics to a Prometheus-compatible data source.
+# "multiple" allows history to be written to multiple backends at once.
+# Defaults to "annotations".
 ; backend = "multiple"

@@ -1505,6 +1507,18 @@ disabled_labels =
 # Default is 64kb
 ;loki_max_query_size = 65536

+# For "prometheus" only.
+# Target datasource UID for writing ALERTS metrics.
+; prometheus_target_datasource_uid = "my-prometheus-uid"
+
+# For "prometheus" only.
+# Metric name for the ALERTS metric. Default is "ALERTS".
+; prometheus_metric_name = "ALERTS"
+
+# For "prometheus" only.
+# Timeout for writing ALERTS metrics to the target datasource. Default is 10s.
+; prometheus_write_timeout = 10s
+
 [unified_alerting.state_history.external_labels]
 # Optional extra labels to attach to outbound state history records or log streams.
 # Any number of label key-value-pairs can be provided.
@@ -362,7 +362,21 @@ func (ng *AlertNG) init() error {
        FeatureToggles: ng.FeatureToggles,
    }

-   history, err := configureHistorianBackend(initCtx, ng.Cfg.UnifiedAlerting.StateHistory, ng.annotationsRepo, ng.dashboardService, ng.store, ng.Metrics.GetHistorianMetrics(), ng.Log, ng.tracer, ac.NewRuleService(ng.accesscontrol))
+   history, err := configureHistorianBackend(
+       initCtx,
+       ng.Cfg.UnifiedAlerting.StateHistory,
+       ng.annotationsRepo,
+       ng.dashboardService,
+       ng.store,
+       ng.Metrics.GetHistorianMetrics(),
+       ng.Log,
+       ng.tracer,
+       ac.NewRuleService(ng.accesscontrol),
+       ng.DataSourceService,
+       ng.httpClientProvider,
+       clk,
+       ng.Metrics.GetRemoteWriterMetrics(),
+   )
    if err != nil {
        return err
    }

@@ -606,7 +620,21 @@ type Historian interface {
    state.Historian
}

-func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian, l log.Logger, tracer tracing.Tracer, ac historian.AccessControl) (Historian, error) {
+func configureHistorianBackend(
+   ctx context.Context,
+   cfg setting.UnifiedAlertingStateHistorySettings,
+   ar annotations.Repository,
+   ds dashboards.DashboardService,
+   rs historian.RuleStore,
+   met *metrics.Historian,
+   l log.Logger,
+   tracer tracing.Tracer,
+   ac historian.AccessControl,
+   datasourceService datasources.DataSourceService,
+   httpClientProvider httpclient.Provider,
+   clock clock.Clock,
+   mw *metrics.RemoteWriter,
+) (Historian, error) {
    if !cfg.Enabled {
        met.Info.WithLabelValues("noop").Set(0)
        return historian.NewNopHistorian(), nil

@@ -621,7 +649,7 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
    if backend == historian.BackendTypeMultiple {
        primaryCfg := cfg
        primaryCfg.Backend = cfg.MultiPrimary
-       primary, err := configureHistorianBackend(ctx, primaryCfg, ar, ds, rs, met, l, tracer, ac)
+       primary, err := configureHistorianBackend(ctx, primaryCfg, ar, ds, rs, met, l, tracer, ac, datasourceService, httpClientProvider, clock, mw)
        if err != nil {
            return nil, fmt.Errorf("multi-backend target \"%s\" was misconfigured: %w", cfg.MultiPrimary, err)
        }

@@ -630,7 +658,7 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
        for _, b := range cfg.MultiSecondaries {
            secCfg := cfg
            secCfg.Backend = b
-           sec, err := configureHistorianBackend(ctx, secCfg, ar, ds, rs, met, l, tracer, ac)
+           sec, err := configureHistorianBackend(ctx, secCfg, ar, ds, rs, met, l, tracer, ac, datasourceService, httpClientProvider, clock, mw)
            if err != nil {
                return nil, fmt.Errorf("multi-backend target \"%s\" was miconfigured: %w", b, err)
            }

@@ -642,7 +670,8 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
    }
    if backend == historian.BackendTypeAnnotations {
        store := historian.NewAnnotationStore(ar, ds, met)
-       annotationBackendLogger := log.New("ngalert.state.historian", "backend", "annotations")
+       logCtx := log.WithContextualAttributes(ctx, []any{"backend", "annotations"})
+       annotationBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
        return historian.NewAnnotationBackend(annotationBackendLogger, store, rs, met, ac), nil
    }
    if backend == historian.BackendTypeLoki {

@@ -651,7 +680,8 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
            return nil, fmt.Errorf("invalid remote loki configuration: %w", err)
        }
        req := historian.NewRequester()
-       lokiBackendLogger := log.New("ngalert.state.historian", "backend", "loki")
+       logCtx := log.WithContextualAttributes(ctx, []any{"backend", "loki"})
+       lokiBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
        backend := historian.NewRemoteLokiBackend(lokiBackendLogger, lcfg, req, met, tracer, rs, ac)

        testConnCtx, cancelFunc := context.WithTimeout(ctx, 10*time.Second)

@@ -662,6 +692,25 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS
        return backend, nil
    }

+   if backend == historian.BackendTypePrometheus {
+       pcfg, err := historian.NewPrometheusConfig(cfg)
+       if err != nil {
+           return nil, fmt.Errorf("invalid remote prometheus configuration: %w", err)
+       }
+       writerCfg := writer.DatasourceWriterConfig{
+           Timeout: cfg.PrometheusWriteTimeout,
+       }
+       logCtx := log.WithContextualAttributes(ctx, []any{"backend", "prometheus"})
+       prometheusBackendLogger := log.New("ngalert.state.historian").FromContext(logCtx)
+       w := writer.NewDatasourceWriter(writerCfg, datasourceService, httpClientProvider, clock, prometheusBackendLogger, mw)
+       if w == nil {
+           return nil, fmt.Errorf("failed to create alert state metrics writer")
+       }
+       backend := historian.NewRemotePrometheusBackend(pcfg, w, prometheusBackendLogger)
+
+       return backend, nil
+   }
+
    return nil, fmt.Errorf("unrecognized state history backend: %s", backend)
}
@@ -90,7 +90,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
        }
        ac := &acfakes.FakeRuleService{}

-       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.ErrorContains(t, err, "unrecognized")
    })

@@ -106,7 +106,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
        }
        ac := &acfakes.FakeRuleService{}

-       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.ErrorContains(t, err, "multi-backend target")
        require.ErrorContains(t, err, "unrecognized")

@@ -124,7 +124,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
        }
        ac := &acfakes.FakeRuleService{}

-       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.ErrorContains(t, err, "multi-backend target")
        require.ErrorContains(t, err, "unrecognized")

@@ -143,7 +143,42 @@ func TestConfigureHistorianBackend(t *testing.T) {
        }
        ac := &acfakes.FakeRuleService{}

-       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.NotNil(t, h)
        require.NoError(t, err)
    })

+   t.Run("fail initialization if prometheus backend missing datasource UID", func(t *testing.T) {
+       met := metrics.NewHistorianMetrics(prometheus.NewRegistry(), metrics.Subsystem)
+       logger := log.NewNopLogger()
+       tracer := tracing.InitializeTracerForTest()
+       cfg := setting.UnifiedAlertingStateHistorySettings{
+           Enabled: true,
+           Backend: "prometheus",
+           // Missing PrometheusTargetDatasourceUID
+       }
+       ac := &acfakes.FakeRuleService{}
+
+       _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
+
+       require.Error(t, err)
+       require.ErrorContains(t, err, "datasource UID must not be empty")
+   })
+
+   t.Run("successful initialization of prometheus backend", func(t *testing.T) {
+       met := metrics.NewHistorianMetrics(prometheus.NewRegistry(), metrics.Subsystem)
+       logger := log.NewNopLogger()
+       tracer := tracing.InitializeTracerForTest()
+       cfg := setting.UnifiedAlertingStateHistorySettings{
+           Enabled:                       true,
+           Backend:                       "prometheus",
+           PrometheusMetricName:          "test_metric",
+           PrometheusTargetDatasourceUID: "test-prometheus-uid",
+       }
+       ac := &acfakes.FakeRuleService{}
+
+       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)
+
+       require.NotNil(t, h)
+       require.NoError(t, err)

@@ -160,7 +195,7 @@ func TestConfigureHistorianBackend(t *testing.T) {
        }
        ac := &acfakes.FakeRuleService{}

-       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.NotNil(t, h)
        require.NoError(t, err)

@@ -183,7 +218,7 @@ grafana_alerting_state_history_info{backend="annotations"} 1
        }
        ac := &acfakes.FakeRuleService{}

-       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac)
+       h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger, tracer, ac, nil, nil, nil, nil)

        require.NotNil(t, h)
        require.NoError(t, err)
@@ -17,6 +17,7 @@ const (
    BackendTypeAnnotations BackendType = "annotations"
    BackendTypeLoki        BackendType = "loki"
    BackendTypeMultiple    BackendType = "multiple"
+   BackendTypePrometheus  BackendType = "prometheus"
    BackendTypeNoop        BackendType = "noop"
)

@@ -27,6 +28,7 @@ func ParseBackendType(s string) (BackendType, error) {
        BackendTypeAnnotations: {},
        BackendTypeLoki:        {},
        BackendTypeMultiple:    {},
+       BackendTypePrometheus:  {},
        BackendTypeNoop:        {},
    }
    p := BackendType(norm)
@@ -0,0 +1,242 @@
package historian

import (
    "context"
    "errors"
    "fmt"
    "maps"
    "math"
    "strings"
    "time"

    "github.com/grafana/dataplane/sdata/numeric"
    "github.com/grafana/grafana-plugin-sdk-go/data"
    promValue "github.com/prometheus/prometheus/model/value"

    "github.com/grafana/grafana/pkg/infra/log"
    "github.com/grafana/grafana/pkg/services/ngalert/eval"
    "github.com/grafana/grafana/pkg/services/ngalert/models"
    "github.com/grafana/grafana/pkg/services/ngalert/state"
    history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
    "github.com/grafana/grafana/pkg/setting"
)

const (
    // Label names for the alert metric.
    alertNameLabel = "alertname"
    // alertStateLabel is the label used to indicate
    // the Prometheus-style alert state: firing or pending.
    alertStateLabel = "alertstate"
    // grafanaAlertStateLabel is the label used to indicate the Grafana-style
    // alert state: alerting, pending, recovering, etc.
    grafanaAlertStateLabel = "grafana_alertstate"
    alertRuleUIDLabel      = "grafana_rule_uid"
)

// isMetricEmittingState defines which evaluation states should emit ALERTS metrics.
// Basically every state that is not Normal should emit metrics currently,
// and is defined here as an allowed state.
func isMetricEmittingState(state eval.State) bool {
    metricEmittingStates := map[eval.State]struct{}{
        eval.Alerting:   {},
        eval.Pending:    {},
        eval.Recovering: {},
        eval.Error:      {},
        eval.NoData:     {},
    }

    _, ok := metricEmittingStates[state]

    return ok
}

// getPrometheusState maps Grafana states to Prometheus alert states.
// In Prometheus, the alertstate label in the ALERTS metric can be either "firing" or "pending",
// so we need to convert Grafana states accordingly.
func getPrometheusState(grafanaState eval.State) string {
    if grafanaState == eval.Recovering || grafanaState == eval.Alerting || grafanaState == eval.Error || grafanaState == eval.NoData {
        return "firing"
    }

    return strings.ToLower(grafanaState.String())
}

type seriesWriter interface {
    WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error
}

type PrometheusConfig struct {
    DatasourceUID string
    MetricName    string
}

func NewPrometheusConfig(cfg setting.UnifiedAlertingStateHistorySettings) (PrometheusConfig, error) {
    if cfg.PrometheusTargetDatasourceUID == "" {
        return PrometheusConfig{}, errors.New("datasource UID must not be empty")
    }

    if cfg.PrometheusMetricName == "" {
        return PrometheusConfig{}, errors.New("metric name must not be empty")
    }

    return PrometheusConfig{
        DatasourceUID: cfg.PrometheusTargetDatasourceUID,
        MetricName:    cfg.PrometheusMetricName,
    }, nil
}

type RemotePrometheusBackend struct {
    cfg        PrometheusConfig
    promWriter seriesWriter
    logger     log.Logger
}

func NewRemotePrometheusBackend(cfg PrometheusConfig, promWriter seriesWriter, logger log.Logger) *RemotePrometheusBackend {
    logger.Info("Initializing remote Prometheus backend", "datasourceUID", cfg.DatasourceUID)

    return &RemotePrometheusBackend{
        cfg:        cfg,
        promWriter: promWriter,
        logger:     logger,
    }
}

func (b *RemotePrometheusBackend) Query(ctx context.Context, query models.HistoryQuery) (*data.Frame, error) {
    return nil, fmt.Errorf("prometheus historian backend does not support querying")
}

func (b *RemotePrometheusBackend) Record(ctx context.Context, rule history_model.RuleMeta, transitions []state.StateTransition) <-chan error {
    errCh := make(chan error, 1)

    if len(transitions) == 0 {
        errCh <- nil
        close(errCh)
        return errCh
    }

    logger := b.logger.FromContext(ctx)

    var frames data.Frames

    for _, t := range transitions {
        transitionFrames := b.framesFor(ctx, rule, t)
        frames = append(frames, transitionFrames...)
    }

    if len(frames) == 0 {
        logger.Debug("No frames generated for alert state metric, nothing to write")
        errCh <- nil
        close(errCh)
        return errCh
    }

    st := transitions[0]

    go func() {
        defer func() {
            if r := recover(); r != nil {
                logger.Error("Panic in prometheus historian", "error", r)
                errCh <- fmt.Errorf("prometheus historian panic: %v", r)
            }
            close(errCh)
        }()

        var sendErr error
        if err := b.promWriter.WriteDatasource(ctx, b.cfg.DatasourceUID, b.cfg.MetricName, st.LastEvaluationTime, frames, st.OrgID, nil); err != nil {
            logger.Error("Failed to write alert state metrics batch", "error", err)
            sendErr = err
        }
        errCh <- sendErr
    }()

    return errCh
}

// framesFor converts a single StateTransition to multiple data.Frames to handle
// transitions that require both StaleNaN for previous state and active metric for current state.
//
// StaleNaN: in the case of a transition from a metric-emitting state to a non-emitting state,
// or when the series changes from one metric-emitting state to another, we should emit a StaleNaN sample
// for the previous state to stop it in Prometheus:
// https://prometheus.io/docs/specs/prw/remote_write_spec/#stale-markers
func (b *RemotePrometheusBackend) framesFor(ctx context.Context, rule history_model.RuleMeta, t state.StateTransition) []*data.Frame {
    samples := getSamples(t)
    if len(samples) == 0 {
        return nil
    }

    logger := b.logger.FromContext(ctx)

    baseLabels := removePrivateLabels(t.Labels)
    baseLabels[alertRuleUIDLabel] = t.AlertRuleUID
    baseLabels[alertNameLabel] = rule.Title

    frameMeta := &data.FrameMeta{
        Type:        data.FrameTypeNumericMulti,
        TypeVersion: numeric.MultiFrameVersionLatest,
    }

    frames := make([]*data.Frame, len(samples))

    for i, sample := range samples {
        labels := make(data.Labels, len(baseLabels)+2)
        maps.Copy(labels, baseLabels)
        labels[alertStateLabel] = sample.promState
        labels[grafanaAlertStateLabel] = sample.grafanaState

        logger.Debug("Creating metric with labels",
            "rule_uid", t.AlertRuleUID,
            "previous_state", t.PreviousState,
            "current_state", t.State.State,
            "last_evaluation_time", t.LastEvaluationTime,
            "rule_title", rule.Title,
            "labels", labels,
            "value", sample.value,
        )

        field := data.NewField("", labels, []float64{sample.value})
        frames[i] = data.NewFrame(b.cfg.MetricName, field)
        frames[i].SetMeta(frameMeta)
    }

    return frames
}

type sample struct {
    value        float64
    grafanaState string
    promState    string
}

// getSamples generates samples based on the state transition.
func getSamples(tr state.StateTransition) []*sample {
    curr, prev := tr.State.State, tr.PreviousState

    var samples []*sample

    // If transitioning from a metric-emitting state to a different state,
    // emit a StaleNaN sample for the previous state to stop it in Prometheus.
    if isMetricEmittingState(prev) && prev != curr {
        prevState := strings.ToLower(prev.String())
        prevPromState := getPrometheusState(prev)

        samples = append(samples, &sample{
            value:        math.Float64frombits(promValue.StaleNaN),
            grafanaState: prevState,
            promState:    prevPromState,
        })
    }

    if isMetricEmittingState(curr) {
        currState := strings.ToLower(curr.String())
        currPromState := getPrometheusState(curr)

        samples = append(samples, &sample{
            value:        1.0,
            grafanaState: currState,
            promState:    currPromState,
        })
    }

    return samples
}
@@ -0,0 +1,344 @@
package historian

import (
    "context"
    "errors"
    "math"
    "testing"
    "time"

    "github.com/grafana/dataplane/sdata/numeric"
    "github.com/grafana/grafana-plugin-sdk-go/data"
    promValue "github.com/prometheus/prometheus/model/value"
    "github.com/stretchr/testify/mock"
    "github.com/stretchr/testify/require"

    "github.com/grafana/grafana/pkg/infra/log"
    "github.com/grafana/grafana/pkg/services/ngalert/eval"
    ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
    "github.com/grafana/grafana/pkg/services/ngalert/state"
    history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
    "github.com/grafana/grafana/pkg/setting"
)

const (
    testMetricName = "test_metric_name"
)

type fakeRemoteWriter struct {
    mock.Mock
}

func (f *fakeRemoteWriter) WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error {
    args := f.Called(ctx, dsUID, name, t, frames, orgID, extraLabels)
    return args.Error(0)
}

type panicRemoteWriter struct {
    mock.Mock
    panicMessage string
}

func (p *panicRemoteWriter) WriteDatasource(ctx context.Context, dsUID string, name string, t time.Time, frames data.Frames, orgID int64, extraLabels map[string]string) error {
    p.Called(ctx, dsUID, name, t, frames, orgID, extraLabels)
    panic(p.panicMessage)
}

func TestNewRemotePrometheusBackend(t *testing.T) {
    cfg, err := NewPrometheusConfig(setting.UnifiedAlertingStateHistorySettings{
        PrometheusTargetDatasourceUID: "test-ds-uid",
        PrometheusMetricName:          testMetricName,
    })
    require.NoError(t, err)

    fakeWriter := new(fakeRemoteWriter)
    logger := log.NewNopLogger()

    backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)

    require.NotNil(t, backend)
    require.Equal(t, cfg.DatasourceUID, backend.cfg.DatasourceUID)
    require.Equal(t, fakeWriter, backend.promWriter)
    require.Equal(t, logger, backend.logger)
}

func createExpectedFrame(t *testing.T, ruleUID, ruleName, promState, grafanaState string, instanceLabels data.Labels, value float64) *data.Frame {
    t.Helper()

    labels := instanceLabels.Copy()
    labels[alertRuleUIDLabel] = ruleUID
    labels[alertNameLabel] = ruleName
    labels[alertStateLabel] = promState
    labels[grafanaAlertStateLabel] = grafanaState

    valueField := data.NewField("", labels, []float64{value})

    frame := data.NewFrame(testMetricName, valueField)
    frame.SetMeta(&data.FrameMeta{
        Type:        data.FrameTypeNumericMulti,
        TypeVersion: numeric.MultiFrameVersionLatest,
    })
    return frame
}

func createTransition(from, to eval.State, orgID int64, now time.Time) state.StateTransition {
    return state.StateTransition{
        State:         &state.State{AlertRuleUID: "rule-uid", OrgID: orgID, Labels: data.Labels{"instance": "server1"}, State: to, LastEvaluationTime: now},
        PreviousState: from,
    }
}

func assertFramesEqual(t *testing.T, actualFrames data.Frames, expectedFrames data.Frames) {
    t.Helper()

    require.Len(t, actualFrames, len(expectedFrames))

    for i, expectedFrame := range expectedFrames {
        actualFrame := actualFrames[i]
        require.Equal(t, expectedFrame.Name, actualFrame.Name)
        require.Len(t, actualFrame.Fields, 1)

        expectedField := expectedFrame.Fields[0]
        actualField := actualFrame.Fields[0]

        // Check labels
        require.Equal(t, expectedField.Labels, actualField.Labels)

        // Check values with NaN handling
        expectedValue := expectedField.At(0).(float64)
        actualValue := actualField.At(0).(float64)
        if math.IsNaN(expectedValue) {
            require.True(t, math.IsNaN(actualValue))
        } else {
            require.Equal(t, expectedValue, actualValue)
        }
    }
}

func TestPrometheusBackend_Record(t *testing.T) {
    cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
    logger := log.NewNopLogger()
    ctx := context.Background()
    orgID := int64(1)
    now := time.Now()
    ruleMeta := history_model.RuleMeta{Title: "test rule"}

    testCases := []struct {
        name           string
        ruleMeta       history_model.RuleMeta
        states         []state.StateTransition
        expectedErr    error
        expectedFrames data.Frames
    }{
        {
            name:     "No states",
            ruleMeta: history_model.RuleMeta{Title: "Test Rule No States"},
            states:   []state.StateTransition{},
        },
        {
            name:     "normal state only (no metrics emitted)",
            ruleMeta: ruleMeta,
            states: []state.StateTransition{
                {State: &state.State{AlertRuleUID: "rule-uid-normal", OrgID: orgID, Labels: data.Labels{"label1": "value1"}, State: eval.Normal, LastEvaluationTime: now}},
            },
        },
        {
            name:     "remote writer error",
            ruleMeta: ruleMeta,
            states: []state.StateTransition{
                {State: &state.State{AlertRuleUID: "rule-uid-err", OrgID: orgID, Labels: data.Labels{}, State: eval.Alerting, LastEvaluationTime: now}},
            },
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid-err", "test rule", "firing", "alerting", data.Labels{}, 1),
            },
            expectedErr: errors.New("remote write failed"),
        },
        {
            name:     "internal labels are skipped",
            ruleMeta: ruleMeta,
            states: []state.StateTransition{
                {
                    State: &state.State{
                        AlertRuleUID:       "rule-uid-internal",
                        OrgID:              orgID,
                        Labels:             data.Labels{ngmodels.AutogeneratedRouteLabel: "ignored", "label1": "value1", "__label2": "value2"},
                        State:              eval.Alerting,
                        LastEvaluationTime: now,
                    },
                },
            },
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid-internal", "test rule", "firing", "alerting", data.Labels{"label1": "value1"}, 1.0),
            },
        },
        {
            name:     "mixed states (normal, pending, recovering, error, nodata)",
            ruleMeta: ruleMeta,
            states: []state.StateTransition{
                {State: &state.State{AlertRuleUID: "rule-uid-normal", OrgID: orgID, Labels: data.Labels{"state": "normal"}, State: eval.Normal, LastEvaluationTime: now}},
                {State: &state.State{AlertRuleUID: "rule-uid-pending", OrgID: orgID, Labels: data.Labels{"state": "pending"}, State: eval.Pending, LastEvaluationTime: now}},
                {State: &state.State{AlertRuleUID: "rule-uid-recovering", OrgID: orgID, Labels: data.Labels{"state": "recovering"}, State: eval.Recovering, LastEvaluationTime: now}},
                {State: &state.State{AlertRuleUID: "rule-uid-error", OrgID: orgID, Labels: data.Labels{"state": "error"}, State: eval.Error, LastEvaluationTime: now}},
                {State: &state.State{AlertRuleUID: "rule-uid-nodata", OrgID: orgID, Labels: data.Labels{"state": "nodata"}, State: eval.NoData, LastEvaluationTime: now}},
            },
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid-pending", "test rule", "pending", "pending", data.Labels{"state": "pending"}, 1.0),
                createExpectedFrame(t, "rule-uid-recovering", "test rule", "firing", "recovering", data.Labels{"state": "recovering"}, 1.0),
                createExpectedFrame(t, "rule-uid-error", "test rule", "firing", "error", data.Labels{"state": "error"}, 1.0),
                createExpectedFrame(t, "rule-uid-nodata", "test rule", "firing", "nodata", data.Labels{"state": "nodata"}, 1.0),
            },
        },

        // State transitions - Normal to other states (single active frame)
        {
            name:     "normal to alerting transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Normal, eval.Alerting, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, 1.0),
            },
        },
        {
            name:     "normal to pending transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Normal, eval.Pending, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "pending", "pending", data.Labels{"instance": "server1"}, 1.0),
            },
        },
        {
            name:     "normal to error transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Normal, eval.Error, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "error", data.Labels{"instance": "server1"}, 1.0),
            },
        },

        // Transitions to Normal (StaleNaN only)
        {
            name:     "alerting to normal transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Alerting, eval.Normal, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
            },
        },
        {
            name:     "error to normal transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Error, eval.Normal, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "error", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
            },
        },
        {
            name:     "pending to alerting transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Pending, eval.Alerting, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "pending", "pending", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, 1.0),
            },
        },
        {
            name:     "alerting to recovering transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Alerting, eval.Recovering, orgID, now)},
            expectedFrames: data.Frames{
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "alerting", data.Labels{"instance": "server1"}, math.Float64frombits(promValue.StaleNaN)),
                createExpectedFrame(t, "rule-uid", "test rule", "firing", "recovering", data.Labels{"instance": "server1"}, 1.0),
            },
        },

        // No metric should be written
        {
            name:     "Normal to Normal transition",
            ruleMeta: ruleMeta,
            states:   []state.StateTransition{createTransition(eval.Normal, eval.Normal, orgID, now)},
        },
    }

    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            fakeWriter := new(fakeRemoteWriter)
            backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)

            if tc.expectedFrames != nil {
                var extraLabels map[string]string
                fakeWriter.On(
                    "WriteDatasource", ctx, cfg.DatasourceUID, testMetricName, now, mock.Anything, orgID, extraLabels,
                ).Return(tc.expectedErr).Once().Run(func(args mock.Arguments) {
                    if tc.expectedErr == nil {
                        actualFrames := args.Get(4).(data.Frames)
                        assertFramesEqual(t, actualFrames, tc.expectedFrames)
                    }
                })
            }

            errCh := backend.Record(ctx, tc.ruleMeta, tc.states)
            err, ok := <-errCh
            require.True(t, ok)

            if tc.expectedErr == nil {
                require.Nil(t, err)
            } else {
                require.ErrorIs(t, err, tc.expectedErr)
            }

            fakeWriter.AssertExpectations(t)
            if tc.expectedFrames == nil {
                fakeWriter.AssertNotCalled(t, "WriteDatasource", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
            }
        })
    }
}

func TestPrometheusBackend_Query(t *testing.T) {
    cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
    logger := log.NewNopLogger()
    fakeWriter := new(fakeRemoteWriter)

    backend := NewRemotePrometheusBackend(cfg, fakeWriter, logger)

    frame, err := backend.Query(context.Background(), ngmodels.HistoryQuery{})
    require.Error(t, err)
    require.Nil(t, frame)
    require.Contains(t, err.Error(), "prometheus historian backend does not support querying")
}

func TestPrometheusBackend_Record_PanicRecovery(t *testing.T) {
    cfg := PrometheusConfig{DatasourceUID: "test-ds-uid", MetricName: testMetricName}
    logger := log.NewNopLogger()
    ctx := context.Background()
    orgID := int64(1)
    now := time.Now()
    ruleMeta := history_model.RuleMeta{Title: "test rule"}

    panicMessage := "panic in WriteDatasource"
    panicWriter := &panicRemoteWriter{panicMessage: panicMessage}

    panicWriter.On("WriteDatasource", ctx, cfg.DatasourceUID, testMetricName, now, mock.Anything, orgID, mock.Anything).Once()

    backend := NewRemotePrometheusBackend(cfg, panicWriter, logger)

    states := []state.StateTransition{
        {State: &state.State{
            AlertRuleUID:       "rule-uid-panic",
            OrgID:              orgID,
            Labels:             data.Labels{"test": "panic"},
            State:              eval.Alerting,
            LastEvaluationTime: now,
        }},
    }

    errCh := backend.Record(ctx, ruleMeta, states)

    err, ok := <-errCh
    require.True(t, ok)
    require.Error(t, err)
    require.ErrorContains(t, err, "prometheus historian panic")
    require.ErrorContains(t, err, panicMessage)

    panicWriter.AssertExpectations(t)
}
@@ -2058,3 +2058,219 @@ func mergeLabels(a, b data.Labels) data.Labels {
    }
    return result
}

// TestStateManager_HistorianIntegration tests that the state manager properly sends
// all expected state transitions to the historian backend.
func TestStateManager_HistorianIntegration(t *testing.T) {
    baseInterval := 1 * time.Second
    tN := func(n int) time.Time {
        return time.Unix(0, 0).UTC().Add(time.Duration(n) * baseInterval)
    }
    t1 := tN(1)
    t2 := tN(2)
    t3 := tN(3)

    labels1 := data.Labels{"instance": "server1", "job": "webapp"}
    labels2 := data.Labels{"instance": "server2", "job": "webapp"}

    baseRule := &models.AlertRule{
        ID:              1,
        OrgID:           1,
        Title:           "test rule",
        UID:             "test-rule-uid",
        NamespaceUID:    "test-namespace",
        IntervalSeconds: 10,
        NoDataState:     models.NoData,
        ExecErrState:    models.ErrorErrState,
        For:             0,
    }

    type transition struct {
        previousState eval.State
        currentState  eval.State
    }

    scenarios := []struct {
        name                string
        rule                *models.AlertRule
        evaluations         map[time.Time][]eval.Result
        expectedTransitions map[time.Time][]transition
    }{
        {
            name: "1:normal -> 1:alerting -> 1:normal",
            rule: baseRule,
            evaluations: map[time.Time][]eval.Result{
                t1: {
                    {Instance: labels1, State: eval.Normal},
                },
                t2: {
                    {Instance: labels1, State: eval.Alerting},
                },
                t3: {
                    {Instance: labels1, State: eval.Normal},
                },
            },
            expectedTransitions: map[time.Time][]transition{
                t1: {
                    {previousState: eval.Normal, currentState: eval.Normal},
                },
                t2: {
                    {previousState: eval.Normal, currentState: eval.Alerting},
                },
                t3: {
                    {previousState: eval.Alerting, currentState: eval.Normal},
                },
            },
        },
        {
            name: "1:alerting, 2:alerting -> 2:alerting -> {}",
            rule: baseRule,
            evaluations: map[time.Time][]eval.Result{
                t1: {
                    {Instance: labels1, State: eval.Alerting},
                    {Instance: labels2, State: eval.Alerting},
                },
                t2: {
                    // labels1 is missing from this evaluation
                    {Instance: labels2, State: eval.Alerting},
                },
                t3: {
                    // Both labels1 and labels2 are missing
                },
            },
            expectedTransitions: map[time.Time][]transition{
                t1: {
                    {previousState: eval.Normal, currentState: eval.Alerting},
                    {previousState: eval.Normal, currentState: eval.Alerting},
                },
                t2: {
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                },
                t3: {
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                },
            },
        },
        {
            name: "1:alerting -> {} -> {}",
            rule: baseRule,
            evaluations: map[time.Time][]eval.Result{
                t1: {
                    {Instance: labels1, State: eval.Alerting, EvaluatedAt: t1},
                },
                t2: {
                    // labels1 is missing - first missing evaluation
                },
                t3: {
                    // labels1 is still missing - second missing evaluation
                },
            },
            expectedTransitions: map[time.Time][]transition{
                t1: {
                    {previousState: eval.Normal, currentState: eval.Alerting},
                },
                t2: {
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                },
                t3: {
                    {previousState: eval.Alerting, currentState: eval.Alerting},
                },
            },
        },
        {
            name: "1:alerting -> 1:recovering -> 1:normal",
            rule: &models.AlertRule{
                ID:              1,
                OrgID:           1,
                Title:           "test rule",
                UID:             "test-rule-uid",
                NamespaceUID:    "test-namespace",
                IntervalSeconds: 10,
                NoDataState:     models.NoData,
                ExecErrState:    models.ErrorErrState,
                For:             0,
                KeepFiringFor:   10,
            },
            evaluations: map[time.Time][]eval.Result{
                t1: {
                    {Instance: labels1, State: eval.Alerting},
                },
                t2: {
                    {Instance: labels1, State: eval.Normal},
                },
                t3: {
                    {Instance: labels1, State: eval.Normal},
                },
            },
            expectedTransitions: map[time.Time][]transition{
                t1: {
                    {previousState: eval.Normal, currentState: eval.Alerting},
                },
                t2: {
                    {previousState: eval.Alerting, currentState: eval.Recovering},
                },
                t3: {
                    {previousState: eval.Recovering, currentState: eval.Normal},
                },
            },
        },
    }

    for _, scenario := range scenarios {
        t.Run(scenario.name, func(t *testing.T) {
            historian := &state.FakeHistorian{}

            cfg := state.ManagerCfg{
                Metrics:       metrics.NewNGAlert(prometheus.NewPedanticRegistry()).GetStateMetrics(),
                ExternalURL:   nil,
                InstanceStore: &state.FakeInstanceStore{},
                Images:        &state.NotAvailableImageService{},
                Clock:         clock.NewMock(),
                Historian:     historian,
                Tracer:        tracing.InitializeTracerForTest(),
                Log:           log.NewNopLogger(),
            }

            mgr := state.NewManager(cfg, state.NewNoopPersister())

            // Helper function to process one time step and verify historian
            processTimeStep := func(evalTime time.Time) {
                results := scenario.evaluations[evalTime]
                expectedTransitions := scenario.expectedTransitions[evalTime]

                for i := range results {
                    results[i].EvaluatedAt = evalTime
                }

                // Clear historian state transitions before the evaluation
                historian.StateTransitions = nil

                mgr.ProcessEvalResults(
                    context.Background(),
                    evalTime,
                    scenario.rule,
                    results,
                    make(data.Labels),
                    nil,
                )

                // Extract just the data we care about from the actual transitions
                actualTransitions := make([]transition, len(historian.StateTransitions))
                for i, t := range historian.StateTransitions {
                    actualTransitions[i] = transition{
                        previousState: t.PreviousState,
                        currentState:  t.State.State,
                    }
                }

                require.ElementsMatch(t, expectedTransitions, actualTransitions)
            }

            processTimeStep(t1)
            processTimeStep(t2)
            processTimeStep(t3)
        })
    }
}
@@ -67,6 +67,8 @@ const (
    lokiDefaultMaxQueryLength      = 721 * time.Hour // 30d1h, matches the default value in Loki
    defaultRecordingRequestTimeout = 10 * time.Second
    lokiDefaultMaxQuerySize        = 65536 // 64kb
+   defaultHistorianPrometheusWriteTimeout = 10 * time.Second
+   defaultHistorianPrometheusMetricName   = "ALERTS"
)

var (

@@ -192,6 +194,9 @@ type UnifiedAlertingStateHistorySettings struct {
    LokiBasicAuthUsername string
    LokiMaxQueryLength    time.Duration
    LokiMaxQuerySize      int
+   PrometheusMetricName          string
+   PrometheusTargetDatasourceUID string
+   PrometheusWriteTimeout        time.Duration
    MultiPrimary     string
    MultiSecondaries []string
    ExternalLabels   map[string]string

@@ -460,6 +465,9 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
        LokiMaxQuerySize: stateHistory.Key("loki_max_query_size").MustInt(lokiDefaultMaxQuerySize),
        MultiPrimary:     stateHistory.Key("primary").MustString(""),
        MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
+       PrometheusMetricName:          stateHistory.Key("prometheus_metric_name").MustString(defaultHistorianPrometheusMetricName),
+       PrometheusTargetDatasourceUID: stateHistory.Key("prometheus_target_datasource_uid").MustString(""),
+       PrometheusWriteTimeout:        stateHistory.Key("prometheus_write_timeout").MustDuration(defaultHistorianPrometheusWriteTimeout),
        ExternalLabels: stateHistoryLabels.KeysHash(),
    }
    uaCfg.StateHistory = uaCfgStateHistory