Alerting: Fix resolved notifications for same-label Error to Normal transitions (#106210)
Actionlint / Lint GitHub Actions files (push) Has been cancelled Details
Backend Code Checks / Validate Backend Configs (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (1/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (2/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (3/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (4/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (5/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (6/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (7/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana (${{ matrix.shard }}) (8/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (1/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (2/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (3/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (4/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (5/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (6/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (7/8) (push) Has been cancelled Details
Backend Unit Tests / Grafana Enterprise (${{ matrix.shard }}) (8/8) (push) Has been cancelled Details
CodeQL checks / Analyze (actions) (push) Has been cancelled Details
CodeQL checks / Analyze (go) (push) Has been cancelled Details
CodeQL checks / Analyze (javascript) (push) Has been cancelled Details
Lint Frontend / Verify i18n (push) Has been cancelled Details
Lint Frontend / Lint (push) Has been cancelled Details
Lint Frontend / Typecheck (push) Has been cancelled Details
Lint Frontend / Betterer (push) Has been cancelled Details
golangci-lint / lint-go (push) Has been cancelled Details
End-to-end tests / Build & Package Grafana (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (1) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (2) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (3) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (4) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (5) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (6) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (7) (push) Has been cancelled Details
Frontend tests / Unit tests (${{ matrix.chunk }} / 8) (8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (1/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (2/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (3/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (4/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (5/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (6/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (7/8) (push) Has been cancelled Details
Integration Tests / Sqlite (${{ matrix.shard }}) (8/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (1/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (2/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (3/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (4/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (5/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (6/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (7/8) (push) Has been cancelled Details
Integration Tests / MySQL (${{ matrix.shard }}) (8/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (1/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (2/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (3/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (4/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (5/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (6/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (7/8) (push) Has been cancelled Details
Integration Tests / Postgres (${{ matrix.shard }}) (8/8) (push) Has been cancelled Details
Reject GitHub secrets / reject-gh-secrets (push) Has been cancelled Details
Run dashboard schema v2 e2e / dashboard-schema-v2-e2e (push) Has been cancelled Details
Dispatch sync to mirror / dispatch-job (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (dashboards-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (panels-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (smoke-tests-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (various-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (old arch) (old-arch/dashboards-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (old arch) (old-arch/panels-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (old arch) (old-arch/smoke-tests-suite) (push) Has been cancelled Details
End-to-end tests / ${{ matrix.suite }} (old arch) (old-arch/various-suite) (push) Has been cancelled Details

What is this feature?

Ensures that a resolved notification is sent when an alert state transitions from Error to Normal. The notification is sent after the configured number of evaluation intervals (the "Missing series evaluations to resolve" setting).

Why do we need this feature?

Before this change, when an alert transitioned from Error to Normal and the labels on the new Normal alert instance were the same, Grafana did not send a resolved notification for the Error alert state. Instead, the alert was resolved automatically in the Alertmanager a few evaluation intervals later, once its endsAt timestamp had passed.

With this change, the resolved notification is sent after the configured number of evaluation intervals (the "Missing series evaluations to resolve" setting).
This commit is contained in:
Alexander Akhmetov 2025-06-07 14:03:11 +02:00 committed by GitHub
parent a7368e004b
commit 3bb4c92028
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 217 additions and 129 deletions

View File

@ -109,11 +109,27 @@ func expandAnnotationsAndLabels(ctx context.Context, log log.Logger, alertRule *
labels, _ := expand(ctx, log, alertRule.Title, alertRule.Labels, templateData, externalURL, result.EvaluatedAt)
annotations, _ := expand(ctx, log, alertRule.Title, alertRule.Annotations, templateData, externalURL, result.EvaluatedAt)
lbs := make(data.Labels, len(extraLabels)+len(labels)+len(resultLabels))
// If the result contains an error, we want to add the ref_id and datasource_uid labels
// to the new state if the alert rule should be in the ErrorErrState.
var errorLabels data.Labels
if result.State == eval.Error && alertRule.ExecErrState == ngModels.ErrorErrState {
refID, datasourceUID := datasourceErrorInfo(result.Error, alertRule)
if refID != "" || datasourceUID != "" {
errorLabels = data.Labels{
"ref_id": refID,
"datasource_uid": datasourceUID,
}
}
}
lbs := make(data.Labels, len(extraLabels)+len(labels)+len(resultLabels)+len(errorLabels))
dupes := make(data.Labels)
for key, val := range extraLabels {
lbs[key] = val
}
for key, val := range errorLabels {
lbs[key] = val
}
for key, val := range labels {
ruleVal, ok := lbs[key]
// if duplicate labels exist, reserved label will take precedence

View File

@ -3519,7 +3519,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -3702,7 +3701,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -3811,7 +3809,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -3846,7 +3843,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Error,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -4006,7 +4002,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -4111,7 +4106,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Error,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
Error: datasourceError,
State: eval.Normal,
@ -4244,6 +4238,22 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
t2: {
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
LatestResult: newEvaluation(t1, eval.Error),
StartsAt: t1,
EndsAt: t1.Add(ResendDelay * 4),
LastEvaluationTime: t1,
LastSentAt: &t1,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule"],
State: eval.Normal,
@ -4251,11 +4261,28 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
LastSentAt: &t1,
},
},
},
t3: {
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Normal,
StateReason: "MissingSeries",
LatestResult: newEvaluation(t1, eval.Error),
StartsAt: t1,
EndsAt: t3,
ResolvedAt: &t3,
LastEvaluationTime: t3,
LastSentAt: &t3,
Error: datasourceError,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Normal,
State: &State{
@ -4265,7 +4292,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t3,
LastSentAt: &t1,
},
},
},
@ -4279,7 +4305,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t4,
LastSentAt: &t1,
},
},
},
@ -4442,7 +4467,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Error,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
Error: datasourceError,
State: eval.Error,
@ -4460,6 +4484,22 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
t3: {
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
LatestResult: newEvaluation(t2, eval.Error),
StartsAt: t1,
EndsAt: t2.Add(ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t1,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule"],
State: eval.Normal,
@ -4467,7 +4507,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t3,
EndsAt: t3,
LastEvaluationTime: t3,
LastSentAt: &t1,
},
},
},
@ -4475,12 +4514,22 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule"],
State: eval.Normal,
LatestResult: newEvaluation(t3, eval.Normal),
StartsAt: t3,
EndsAt: t3,
LastEvaluationTime: t3,
},
},
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
LatestResult: newEvaluation(t4, eval.Error),
Error: datasourceError,
StartsAt: t4,
StartsAt: t1,
EndsAt: t4.Add(ResendDelay * 4),
LastEvaluationTime: t4,
LastSentAt: &t4,
@ -4658,7 +4707,17 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule"],
State: eval.Normal,
LatestResult: newEvaluation(t1, eval.Normal),
StartsAt: t1,
EndsAt: t1,
LastEvaluationTime: t1,
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -4760,7 +4819,17 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Pending,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule"],
State: eval.Pending,
LatestResult: newEvaluation(t1, eval.Alerting),
StartsAt: t1,
EndsAt: t1.Add(ResendDelay * 4),
LastEvaluationTime: t1,
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -4854,7 +4923,17 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Pending,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule"],
State: eval.Pending,
LatestResult: newEvaluation(t1, eval.Alerting),
StartsAt: t1,
EndsAt: t1.Add(ResendDelay * 4),
LastEvaluationTime: t1,
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -4872,14 +4951,31 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
t3: {
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
LatestResult: newEvaluation(t2, eval.Error),
StartsAt: t2,
EndsAt: t2.Add(ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Pending,
State: &State{
Labels: labels["system + rule"],
State: eval.Pending,
State: eval.Alerting,
LatestResult: newEvaluation(t3, eval.Alerting),
StartsAt: t3,
EndsAt: t3.Add(ResendDelay * 4),
FiredAt: &t3,
LastEvaluationTime: t3,
LastSentAt: &t2,
LastSentAt: &t3,
},
},
},
@ -4999,6 +5095,22 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
t2: {
{
PreviousState: eval.Error,
State: &State{
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
LatestResult: newEvaluation(t1, eval.Error),
StartsAt: t1,
EndsAt: t1.Add(ResendDelay * 4),
LastEvaluationTime: t1,
LastSentAt: &t1,
Error: datasourceError,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule"],
State: eval.Normal,
@ -5006,7 +5118,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
LastSentAt: &t1, // TODO: Fix me. This should be t2 since we should be resolving the previous DatasourceError alert.
},
},
},
@ -5081,7 +5192,6 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Normal,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
@ -5100,15 +5210,30 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
{
PreviousState: eval.Error,
State: &State{
CacheID: labels["system + rule"].Fingerprint(),
Labels: labels["system + rule + datasource-error"],
State: eval.Error,
Error: datasourceError,
LatestResult: newEvaluation(t1, eval.Error),
StartsAt: t1,
EndsAt: t1.Add(ResendDelay * 4),
LastEvaluationTime: t1,
LastSentAt: &t1,
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
"Error": datasourceError.Error(),
}),
},
},
{
PreviousState: eval.Normal,
State: &State{
Labels: labels["system + rule"],
State: eval.Error,
Error: genericError,
LatestResult: newEvaluation(t2, eval.Error),
StartsAt: t1,
StartsAt: t2,
EndsAt: t2.Add(ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t1,
LastSentAt: &t2,
Annotations: genericErrorAnnotations,
},
},

View File

@ -1016,7 +1016,7 @@ func TestProcessEvalResults(t *testing.T) {
// TODO(@moustafab): figure out why this test doesn't fail as is
desc: "classic condition, execution Error as Error (alerting -> query error -> alerting)",
alertRule: baseRuleWith(m.WithErrorExecAs(models.ErrorErrState)),
expectedAnnotations: 3,
expectedAnnotations: 2,
evalResults: map[time.Time]eval.Results{
t1: {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(data.Labels{})),
@ -1030,15 +1030,33 @@ func TestProcessEvalResults(t *testing.T) {
},
expectedStates: []*state.State{
{
Labels: labels["system + rule"],
Labels: data.Labels{"label": "test", "system": "owned"},
ResultFingerprint: data.Labels{}.Fingerprint(),
State: eval.Alerting,
LatestResult: newEvaluation(t3, eval.Alerting),
StartsAt: t3,
StartsAt: t1,
EndsAt: t3.Add(state.ResendDelay * 4),
FiredAt: &t3,
FiredAt: &t1,
LastEvaluationTime: t3,
LastSentAt: &t1, // Resend delay is 30s, so last sent at is t1.
LastSentAt: &t1,
Annotations: map[string]string{
"annotation": "test",
},
},
{
Labels: data.Labels{"system": "owned", "label": "test", "ref_id": "A", "datasource_uid": "datasource_uid_1"},
ResultFingerprint: data.Labels{}.Fingerprint(),
State: eval.Error,
LatestResult: newEvaluation(t2, eval.Error),
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
Error: expr.MakeQueryError("A", "test-datasource-uid", errors.New("this is an error")),
Annotations: map[string]string{
"Error": "[sse.dataQueryError] failed to execute query [A]: this is an error",
"annotation": "test",
},
},
},
},

View File

@ -219,16 +219,30 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
a.EndsAt = nextEndsTime(interval, evaluatedAt)
}
// AddErrorInformation adds annotations to the state to indicate that an error occurred.
// If addDatasourceInfoToLabels is true, the ref_id and datasource_uid are added to the labels,
// otherwise, they are added to the annotations.
func (a *State) AddErrorInformation(err error, rule *models.AlertRule, addDatasourceInfoToLabels bool) {
// addErrorInfoToAnnotations records the given evaluation error in the state's annotations.
//
// A nil err is a no-op. Otherwise the "Error" annotation is set to err.Error(), and the
// "ref_id" and "datasource_uid" annotations are either set from the values returned by
// datasourceErrorInfo(err, rule) or removed when both returned values are empty.
func (a *State) addErrorInfoToAnnotations(err error, rule *models.AlertRule) {
if err == nil {
return
}
a.Annotations["Error"] = err.Error()
refID, datasourceUID := datasourceErrorInfo(err, rule)
if refID != "" || datasourceUID != "" {
// At least one of the two values is known: write both annotations (either may be empty).
a.Annotations["ref_id"] = refID
a.Annotations["datasource_uid"] = datasourceUID
} else {
// Remove the ref_id and datasource_uid from the annotations if they are present.
// It can happen if the alert state hasn't changed, but the error is different now.
delete(a.Annotations, "ref_id")
delete(a.Annotations, "datasource_uid")
}
}
// datasourceErrorInfo returns ref_id and datasource_uid if the evaluation
// failed because a query returned an error.
func datasourceErrorInfo(err error, rule *models.AlertRule) (string, string) {
// If the evaluation failed because a query returned an error then add the Ref ID and
// Datasource UID as labels or annotations
var utilError errutil.Error
@ -236,22 +250,12 @@ func (a *State) AddErrorInformation(err error, rule *models.AlertRule, addDataso
(errors.Is(err, expr.QueryError) || errors.Is(err, expr.ConversionError)) {
for _, next := range rule.Data {
if next.RefID == utilError.PublicPayload["refId"].(string) {
if addDatasourceInfoToLabels {
a.Labels["ref_id"] = next.RefID
a.Labels["datasource_uid"] = next.DatasourceUID
} else {
a.Annotations["ref_id"] = next.RefID
a.Annotations["datasource_uid"] = next.DatasourceUID
}
break
return next.RefID, next.DatasourceUID
}
}
} else {
// Remove the ref_id and datasource_uid from the annotations if they are present.
// It can happen if the alert state hasn't changed, but the error is different now.
delete(a.Annotations, "ref_id")
delete(a.Annotations, "datasource_uid")
}
return "", ""
}
func (a *State) SetNextValues(result eval.Result) {
@ -464,12 +468,11 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
resultAlerting(state, rule, result, logger, models.StateReasonError)
// This is a special case where Alerting and Pending should also have an error and reason
state.Error = result.Error
state.AddErrorInformation(result.Error, rule, false)
state.addErrorInfoToAnnotations(result.Error, rule)
case models.ErrorErrState:
if state.State == eval.Error {
prevEndsAt := state.EndsAt
state.Error = result.Error
state.AddErrorInformation(result.Error, rule, true)
state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Keeping state",
"state",
@ -491,20 +494,23 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
"next_ends_at",
nextEndsAt)
state.SetError(result.Error, result.EvaluatedAt, nextEndsAt)
state.AddErrorInformation(result.Error, rule, true)
}
// TODO: always add annotations
if result.Error != nil {
state.Annotations["Error"] = result.Error.Error()
}
case models.OkErrState:
logger.Debug("Execution error state is Normal", "handler", "resultNormal", "previous_handler", handlerStr)
resultNormal(state, rule, result, logger, "") // TODO: Should we add a reason?
state.AddErrorInformation(result.Error, rule, false)
state.addErrorInfoToAnnotations(result.Error, rule)
case models.KeepLastErrState:
logger := logger.New("previous_handler", handlerStr)
resultKeepLast(state, rule, result, logger)
state.AddErrorInformation(result.Error, rule, false)
state.addErrorInfoToAnnotations(result.Error, rule)
default:
err := fmt.Errorf("unsupported execution error state: %s", rule.ExecErrState)
state.SetError(err, state.StartsAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
state.AddErrorInformation(result.Error, rule, false)
state.addErrorInfoToAnnotations(result.Error, rule)
}
}
@ -798,20 +804,6 @@ func patch(newState, existingState *State, result eval.Result) {
newState.Annotations[key] = value
}
}
// if the current state is "data source error" then it may have additional labels that may not exist in the new state.
// See https://github.com/grafana/grafana/blob/c7fdf8ce706c2c9d438f5e6eabd6e580bac4946b/pkg/services/ngalert/state/state.go#L161-L163
// copy known labels over to the new instance, it can help reduce flapping
// TODO fix this?
if existingState.State == eval.Error && result.State == eval.Error {
setIfExist := func(lbl string) {
if v, ok := existingState.Labels[lbl]; ok {
newState.Labels[lbl] = v
}
}
setIfExist("datasource_uid")
setIfExist("ref_id")
}
}
func (a *State) transition(alertRule *models.AlertRule, result eval.Result, extraAnnotations data.Labels, logger log.Logger, takeImageFn takeImageFn) StateTransition {
@ -831,27 +823,6 @@ func (a *State) transition(alertRule *models.AlertRule, result eval.Result, extr
// Add the instance to the log context to help correlate log lines for a state
logger = logger.New("instance", result.Instance)
// if the current state is Error but the result is different, then we need o clean up the extra labels
// that were added after the state key was calculated
// https://github.com/grafana/grafana/blob/1df4d332c982dc5e394201bb2ef35b442727ce63/pkg/services/ngalert/state/state.go#L298-L311
// Usually, it happens in the case of classic conditions when the evalResult does not have labels.
//
// This is temporary change to make sure that the labels are not persistent in the state after it was in Error state
// TODO yuri. Remove it when correct Error result with labels is provided
if a.State == eval.Error && result.State != eval.Error {
// This is possible because state was updated after the CacheID was calculated.
_, curOk := a.Labels["ref_id"]
_, resOk := result.Instance["ref_id"]
if curOk && !resOk {
delete(a.Labels, "ref_id")
}
_, curOk = a.Labels["datasource_uid"]
_, resOk = result.Instance["datasource_uid"]
if curOk && !resOk {
delete(a.Labels, "datasource_uid")
}
}
switch result.State {
case eval.Normal:
logger.Debug("Setting next state", "handler", "resultNormal")

View File

@ -1091,48 +1091,6 @@ func TestPatch(t *testing.T) {
assert.Equal(t, orig.LastEvaluationTime, state.LastEvaluationTime)
assert.Equal(t, orig.EvaluationDuration, state.EvaluationDuration)
})
t.Run("if result Error and current state is Error it should copy datasource_uid and ref_id labels", func(t *testing.T) {
state := randomSate(key)
orig := state.Copy()
current := randomSate(key)
current.State = eval.Error
current.Labels["datasource_uid"] = util.GenerateShortUID()
current.Labels["ref_id"] = util.GenerateShortUID()
result := eval.Result{
Instance: ngmodels.GenerateAlertLabels(5, "result-"),
State: eval.Error,
}
expectedLabels := orig.Labels.Copy()
expectedLabels["datasource_uid"] = current.Labels["datasource_uid"]
expectedLabels["ref_id"] = current.Labels["ref_id"]
patch(&state, &current, result)
assert.Equal(t, expectedLabels, state.Labels)
assert.Equal(t, current.State, state.State)
assert.Equal(t, current.StateReason, state.StateReason)
assert.Equal(t, current.Image, state.Image)
assert.Equal(t, current.LatestResult, state.LatestResult)
assert.Equal(t, current.Error, state.Error)
assert.Equal(t, current.Values, state.Values)
assert.Equal(t, current.StartsAt, state.StartsAt)
assert.Equal(t, current.EndsAt, state.EndsAt)
assert.Equal(t, current.ResolvedAt, state.ResolvedAt)
assert.Equal(t, current.LastSentAt, state.LastSentAt)
assert.Equal(t, current.LastEvaluationString, state.LastEvaluationString)
// Fields that should not change
assert.Equal(t, orig.OrgID, state.OrgID)
assert.Equal(t, orig.AlertRuleUID, state.AlertRuleUID)
assert.Equal(t, orig.CacheID, state.CacheID)
assert.Equal(t, orig.ResultFingerprint, state.ResultFingerprint)
assert.Equal(t, orig.LastEvaluationTime, state.LastEvaluationTime)
assert.Equal(t, orig.EvaluationDuration, state.EvaluationDuration)
assert.EqualValues(t, orig.Annotations, state.Annotations)
})
}
func TestResultStateReason(t *testing.T) {