mirror of https://github.com/grafana/grafana.git
alerting: add lock on job to prevent a race condition (#18218)
without this lock there is a race condition between the scheduler and job processing.
This commit is contained in:
parent
e3181e66b4
commit
364d2358d8
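
Before the diff, a minimal, standalone Go sketch of the pattern this commit introduces (this is not Grafana code; the job type and helper names below are illustrative only): the running flag is read and written only while a sync.Mutex is held, so a "scheduler" goroutine and an "engine" goroutine no longer race on the same bool. Without the mutex, Go's race detector (go run -race) reports a data race on the flag.

// race_sketch.go - illustrative only, not part of the Grafana codebase.
package main

import (
    "fmt"
    "sync"
)

type job struct {
    mu      sync.Mutex
    running bool
}

// setRunning updates the flag under the lock, mirroring Job.SetRunning in the diff below.
func (j *job) setRunning(b bool) {
    j.mu.Lock()
    j.running = b
    j.mu.Unlock()
}

// getRunning reads the flag under the lock, mirroring Job.GetRunning in the diff below.
func (j *job) getRunning() bool {
    j.mu.Lock()
    defer j.mu.Unlock()
    return j.running
}

func main() {
    j := &job{}
    var wg sync.WaitGroup
    wg.Add(2)

    // "Scheduler": repeatedly checks whether the job is already running.
    go func() {
        defer wg.Done()
        for i := 0; i < 1000; i++ {
            _ = j.getRunning()
        }
    }()

    // "Engine": marks the job running while it executes, then clears the flag.
    go func() {
        defer wg.Done()
        for i := 0; i < 1000; i++ {
            j.setRunning(true)
            j.setRunning(false)
        }
    }()

    wg.Wait()
    fmt.Println("no data race reported when run with: go run -race race_sketch.go")
}

The actual change in the diff is the same idea applied to the Job struct: the exported Running field becomes an unexported running field guarded by runningLock, and all readers and writers go through GetRunning/SetRunning.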
@@ -117,7 +117,7 @@ func (e *AlertEngine) processJobWithRetry(grafanaCtx context.Context, job *Job)
 
     // Initialize with first attemptID=1
     attemptChan <- 1
-    job.Running = true
+    job.SetRunning(true)
 
     for {
         select {
@@ -141,7 +141,7 @@ func (e *AlertEngine) processJobWithRetry(grafanaCtx context.Context, job *Job)
 }
 
 func (e *AlertEngine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
-    job.Running = false
+    job.SetRunning(false)
     close(cancelChan)
     for cancelFn := range cancelChan {
         cancelFn()
@@ -22,7 +22,7 @@ func TestEngineTimeouts(t *testing.T) {
         setting.AlertingNotificationTimeout = 30 * time.Second
         setting.AlertingMaxAttempts = 3
         engine.resultHandler = &FakeResultHandler{}
-        job := &Job{Running: true, Rule: &Rule{}}
+        job := &Job{running: true, Rule: &Rule{}}
 
         Convey("Should trigger as many retries as needed", func() {
             Convey("pended alert for datasource -> result handler should be worked", func() {
@@ -45,7 +45,7 @@ func TestEngineProcessJob(t *testing.T) {
         setting.AlertingNotificationTimeout = 30 * time.Second
         setting.AlertingMaxAttempts = 3
         engine.resultHandler = &FakeResultHandler{}
-        job := &Job{Running: true, Rule: &Rule{}}
+        job := &Job{running: true, Rule: &Rule{}}
 
         Convey("Should trigger retry if needed", func() {
 
@@ -1,14 +1,33 @@
 package alerting
 
-import "github.com/grafana/grafana/pkg/components/null"
+import (
+    "sync"
+
+    "github.com/grafana/grafana/pkg/components/null"
+)
 
 // Job holds state about when the alert rule should be evaluated.
 type Job struct {
-    Offset     int64
-    OffsetWait bool
-    Delay      bool
-    Running    bool
-    Rule       *Rule
+    Offset      int64
+    OffsetWait  bool
+    Delay       bool
+    running     bool
+    Rule        *Rule
+    runningLock sync.Mutex // Lock for running property which is used in the Scheduler and AlertEngine execution
 }
 
+// GetRunning returns true if the job is running. A lock is taken and released on the Job to ensure atomicity.
+func (j *Job) GetRunning() bool {
+    defer j.runningLock.Unlock()
+    j.runningLock.Lock()
+    return j.running
+}
+
+// SetRunning sets the running property on the Job. A lock is taken and released on the Job to ensure atomicity.
+func (j *Job) SetRunning(b bool) {
+    j.runningLock.Lock()
+    j.running = b
+    j.runningLock.Unlock()
+}
+
 // ResultLogEntry represents log data for the alert evaluation.
@@ -30,9 +30,8 @@ func (s *schedulerImpl) Update(rules []*Rule) {
         if s.jobs[rule.ID] != nil {
             job = s.jobs[rule.ID]
         } else {
-            job = &Job{
-                Running: false,
-            }
+            job = &Job{}
+            job.SetRunning(false)
         }
 
         job.Rule = rule
@@ -52,7 +51,7 @@ func (s *schedulerImpl) Tick(tickTime time.Time, execQueue chan *Job) {
     now := tickTime.Unix()
 
     for _, job := range s.jobs {
-        if job.Running || job.Rule.State == models.AlertStatePaused {
+        if job.GetRunning() || job.Rule.State == models.AlertStatePaused {
             continue
         }
 