package store_test

import (
	"bytes"
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/golang/snappy"
	"github.com/grafana/grafana/pkg/util/testutil"
	"github.com/stretchr/testify/require"
	"google.golang.org/protobuf/proto"

	"github.com/grafana/grafana/pkg/infra/db"
	"github.com/grafana/grafana/pkg/services/featuremgmt"
	"github.com/grafana/grafana/pkg/services/ngalert/models"
	pb "github.com/grafana/grafana/pkg/services/ngalert/store/proto/v1"
	"github.com/grafana/grafana/pkg/services/ngalert/tests"
	"github.com/grafana/grafana/pkg/util"
)

const baseIntervalSeconds = 10

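// TestIntegration_CompressedAlertRuleStateOperations verifies that alert rule
// state survives a save/list round trip when the
// FlagAlertingSaveStateCompressed feature toggle is enabled.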
func TestIntegration_CompressedAlertRuleStateOperations(t *testing.T) {
	testutil.SkipIntegrationTestInShortMode(t)

	ctx := context.Background()
	ng, dbstore := tests.SetupTestEnv(
		t,
		baseIntervalSeconds,
		tests.WithFeatureToggles(
			featuremgmt.WithFeatures(featuremgmt.FlagAlertingSaveStateCompressed),
		),
	)

	const mainOrgID int64 = 1

	alertRule1 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	orgID := alertRule1.OrgID
	alertRule2 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	require.Equal(t, orgID, alertRule2.OrgID)

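	// Table-driven cases: each case builds a set of instances, saves them for
	// rule 1, and validates what ListAlertInstances returns.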
	tests := []struct {
		name           string
		setupInstances func() []models.AlertInstance
		listQuery      *models.ListAlertInstancesQuery
		validate       func(t *testing.T, alerts []*models.AlertInstance)
	}{
		{
			name: "can save and read alert rule state",
			setupInstances: func() []models.AlertInstance {
				return []models.AlertInstance{
					createAlertInstance(alertRule1.OrgID, alertRule1.UID, "labelsHash1", string(models.InstanceStateError), models.InstanceStateFiring),
				}
			},
			listQuery: &models.ListAlertInstancesQuery{
				RuleOrgID: alertRule1.OrgID,
				RuleUID:   alertRule1.UID,
			},
			validate: func(t *testing.T, alerts []*models.AlertInstance) {
				require.Len(t, alerts, 1)
				require.Equal(t, "labelsHash1", alerts[0].LabelsHash)
			},
		},
		{
			name: "can save and read alert rule state with multiple instances",
			setupInstances: func() []models.AlertInstance {
				return []models.AlertInstance{
					createAlertInstance(alertRule1.OrgID, alertRule1.UID, "hash1", "", models.InstanceStateFiring),
					createAlertInstance(alertRule1.OrgID, alertRule1.UID, "hash2", "", models.InstanceStateFiring),
				}
			},
			listQuery: &models.ListAlertInstancesQuery{
				RuleOrgID: alertRule1.OrgID,
				RuleUID:   alertRule1.UID,
			},
			validate: func(t *testing.T, alerts []*models.AlertInstance) {
				require.Len(t, alerts, 2)
				containsHash(t, alerts, "hash1")
				containsHash(t, alerts, "hash2")
			},
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			instances := tc.setupInstances()
			err := ng.InstanceStore.SaveAlertInstancesForRule(ctx, alertRule1.GetKeyWithGroup(), instances)
			require.NoError(t, err)
			alerts, err := ng.InstanceStore.ListAlertInstances(ctx, tc.listQuery)
			require.NoError(t, err)
			tc.validate(t, alerts)
		})
	}
}

// containsHash is a helper function to check if an instance with
// a given labels hash exists in the list of alert instances.
func containsHash(t *testing.T, instances []*models.AlertInstance, hash string) {
	t.Helper()

	for _, i := range instances {
		if i.LabelsHash == hash {
			return
		}
	}

	require.Fail(t, fmt.Sprintf("%v does not contain an instance with hash %s", instances, hash))
}

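// createAlertInstance builds a minimal AlertInstance for tests from the given
// rule key fields, reason, and state, using a fixed one-entry label set.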
func createAlertInstance(orgID int64, ruleUID, labelsHash, reason string, state models.InstanceStateType) models.AlertInstance {
	return models.AlertInstance{
		AlertInstanceKey: models.AlertInstanceKey{
			RuleOrgID:  orgID,
			RuleUID:    ruleUID,
			LabelsHash: labelsHash,
		},
		CurrentState:  state,
		CurrentReason: reason,
		Labels:        models.InstanceLabels{"label1": "value1"},
	}
}

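// TestIntegrationAlertInstanceOperations covers the basic save, list, and
// update behavior of the alert instance store across several rules in one org.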
func TestIntegrationAlertInstanceOperations(t *testing.T) {
	testutil.SkipIntegrationTestInShortMode(t)

	ctx := context.Background()
	ng, dbstore := tests.SetupTestEnv(t, baseIntervalSeconds)

	const mainOrgID int64 = 1

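	// Create four rules in the same org; the subtests below each work against
	// their own rule.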
	alertRule1 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	orgID := alertRule1.OrgID

	alertRule2 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	require.Equal(t, orgID, alertRule2.OrgID)

	alertRule3 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	require.Equal(t, orgID, alertRule3.OrgID)

	alertRule4 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
	require.Equal(t, orgID, alertRule4.OrgID)

	t.Run("can save and read new alert instance", func(t *testing.T) {
		labels := models.InstanceLabels{"test": "testValue"}
		_, hash, _ := labels.StringAndHash()
		instance := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  alertRule1.OrgID,
				RuleUID:    alertRule1.UID,
				LabelsHash: hash,
			},
			CurrentState:  models.InstanceStateFiring,
			CurrentReason: string(models.InstanceStateError),
			Labels:        labels,
		}
		err := ng.InstanceStore.SaveAlertInstance(ctx, instance)
		require.NoError(t, err)

		listCmd := &models.ListAlertInstancesQuery{
			RuleOrgID: instance.RuleOrgID,
			RuleUID:   instance.RuleUID,
		}
		alerts, err := ng.InstanceStore.ListAlertInstances(ctx, listCmd)
		require.NoError(t, err)

		require.Len(t, alerts, 1)
		require.Equal(t, instance.Labels, alerts[0].Labels)
		require.Equal(t, alertRule1.OrgID, alerts[0].RuleOrgID)
		require.Equal(t, alertRule1.UID, alerts[0].RuleUID)
		require.Equal(t, instance.CurrentReason, alerts[0].CurrentReason)
	})

	t.Run("can save and read new alert instance with no labels", func(t *testing.T) {
		labels := models.InstanceLabels{}
		_, hash, _ := labels.StringAndHash()
		instance := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  alertRule2.OrgID,
				RuleUID:    alertRule2.UID,
				LabelsHash: hash,
			},
			CurrentState: models.InstanceStateNormal,
			Labels:       labels,
		}
		err := ng.InstanceStore.SaveAlertInstance(ctx, instance)
		require.NoError(t, err)

		listCmd := &models.ListAlertInstancesQuery{
			RuleOrgID: instance.RuleOrgID,
			RuleUID:   instance.RuleUID,
		}

		alerts, err := ng.InstanceStore.ListAlertInstances(ctx, listCmd)
		require.NoError(t, err)

		require.Len(t, alerts, 1)
		require.Equal(t, alertRule2.OrgID, alerts[0].RuleOrgID)
		require.Equal(t, alertRule2.UID, alerts[0].RuleUID)
		require.Equal(t, instance.Labels, alerts[0].Labels)
	})

	t.Run("can save two instances with same org_id, uid and different labels", func(t *testing.T) {
		labels := models.InstanceLabels{"test": "testValue"}
		_, hash, _ := labels.StringAndHash()
		instance1 := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  alertRule3.OrgID,
				RuleUID:    alertRule3.UID,
				LabelsHash: hash,
			},
			CurrentState: models.InstanceStateFiring,
			Labels:       labels,
		}

		err := ng.InstanceStore.SaveAlertInstance(ctx, instance1)
		require.NoError(t, err)

		labels = models.InstanceLabels{"test": "testValue2"}
		_, hash, _ = labels.StringAndHash()
		instance2 := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  instance1.RuleOrgID,
				RuleUID:    instance1.RuleUID,
				LabelsHash: hash,
			},
			CurrentState: models.InstanceStateFiring,
			Labels:       labels,
		}
		err = ng.InstanceStore.SaveAlertInstance(ctx, instance2)
		require.NoError(t, err)

		listQuery := &models.ListAlertInstancesQuery{
			RuleOrgID: instance1.RuleOrgID,
			RuleUID:   instance1.RuleUID,
		}

		alerts, err := ng.InstanceStore.ListAlertInstances(ctx, listQuery)
		require.NoError(t, err)

		require.Len(t, alerts, 2)
	})

	t.Run("can list all added instances in org", func(t *testing.T) {
		listQuery := &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		}

		alerts, err := ng.InstanceStore.ListAlertInstances(ctx, listQuery)
		require.NoError(t, err)

		require.Len(t, alerts, 4)
	})

	t.Run("update instance with same org_id, uid and different state", func(t *testing.T) {
		labels := models.InstanceLabels{"test": "testValue"}
		_, hash, _ := labels.StringAndHash()
		instance1 := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  alertRule4.OrgID,
				RuleUID:    alertRule4.UID,
				LabelsHash: hash,
			},
			CurrentState: models.InstanceStateFiring,
			Labels:       labels,
		}

		err := ng.InstanceStore.SaveAlertInstance(ctx, instance1)
		require.NoError(t, err)

		instance2 := models.AlertInstance{
			AlertInstanceKey: models.AlertInstanceKey{
				RuleOrgID:  alertRule4.OrgID,
				RuleUID:    instance1.RuleUID,
				LabelsHash: instance1.LabelsHash,
			},
			CurrentState: models.InstanceStateNormal,
			Labels:       instance1.Labels,
		}
		err = ng.InstanceStore.SaveAlertInstance(ctx, instance2)
		require.NoError(t, err)

		listQuery := &models.ListAlertInstancesQuery{
			RuleOrgID: alertRule4.OrgID,
			RuleUID:   alertRule4.UID,
		}

		alerts, err := ng.InstanceStore.ListAlertInstances(ctx, listQuery)
		require.NoError(t, err)

		require.Len(t, alerts, 1)

		require.Equal(t, instance2.RuleOrgID, alerts[0].RuleOrgID)
		require.Equal(t, instance2.RuleUID, alerts[0].RuleUID)
		require.Equal(t, instance2.Labels, alerts[0].Labels)
		require.Equal(t, instance2.CurrentState, alerts[0].CurrentState)
	})
}
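// TestIntegrationFullSync verifies that FullSync replaces the stored set of
// alert instances wholesale: stale entries are removed, new ones are added,
// and writes are applied in batches of the given size.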
func TestIntegrationFullSync(t *testing.T) {
	testutil.SkipIntegrationTestInShortMode(t)

	batchSize := 1

	ctx := context.Background()
	ng, _ := tests.SetupTestEnv(t, baseIntervalSeconds)

	orgID := int64(1)

	ruleUIDs := []string{"a", "b", "c", "d"}

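	// Seed one instance per rule UID; FullSync treats this slice as the
	// complete desired state for the org.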
	instances := make([]models.AlertInstance, len(ruleUIDs))
	for i, ruleUID := range ruleUIDs {
		instances[i] = generateTestAlertInstance(orgID, ruleUID)
	}

	t.Run("Should do a proper full sync", func(t *testing.T) {
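		// The trailing nil argument appears to be the optional per-batch delay
		// (jitter) added for periodic state saves; nil writes the batches
		// back-to-back with no spreading.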
		err := ng.InstanceStore.FullSync(ctx, instances, batchSize, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances))
		for _, ruleUID := range ruleUIDs {
			found := false
			for _, instance := range res {
				if instance.RuleUID == ruleUID {
					found = true
					continue
				}
			}
			if !found {
				t.Errorf("Instance with RuleUID '%s' not found", ruleUID)
			}
		}
	})

	t.Run("Should remove non-existing entries on sync", func(t *testing.T) {
		err := ng.InstanceStore.FullSync(ctx, instances[1:], batchSize, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances)-1)
		for _, instance := range res {
			if instance.RuleUID == "a" {
				t.Error("Instance with RuleUID 'a' should not exist anymore")
			}
		}
	})

	t.Run("Should add new entries on sync", func(t *testing.T) {
		newRuleUID := "y"
		err := ng.InstanceStore.FullSync(ctx, append(instances, generateTestAlertInstance(orgID, newRuleUID)), batchSize, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances)+1)
		for _, ruleUID := range append(ruleUIDs, newRuleUID) {
			found := false
			for _, instance := range res {
				if instance.RuleUID == ruleUID {
					found = true
					continue
				}
			}
			if !found {
				t.Errorf("Instance with RuleUID '%s' not found", ruleUID)
			}
		}
	})

	t.Run("Should save all instances when batch size is bigger than 1", func(t *testing.T) {
		batchSize = 2
		newRuleUID := "y"
		err := ng.InstanceStore.FullSync(ctx, append(instances, generateTestAlertInstance(orgID, newRuleUID)), batchSize, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances)+1)
		for _, ruleUID := range append(ruleUIDs, newRuleUID) {
			found := false
			for _, instance := range res {
				if instance.RuleUID == ruleUID {
					found = true
					continue
				}
			}
			if !found {
				t.Errorf("Instance with RuleUID '%s' not found", ruleUID)
			}
		}
	})

	t.Run("Should not fail when the instances are empty", func(t *testing.T) {
		// First, insert some data into the table.
		initialInstances := []models.AlertInstance{
			generateTestAlertInstance(orgID, "preexisting-1"),
			generateTestAlertInstance(orgID, "preexisting-2"),
		}
		err := ng.InstanceStore.FullSync(ctx, initialInstances, 5, nil)
		require.NoError(t, err)

		// Now call FullSync with no instances. According to the code, this should return nil
		// and should not delete anything in the table.
		err = ng.InstanceStore.FullSync(ctx, []models.AlertInstance{}, 5, nil)
		require.NoError(t, err)

		// Check that the previously inserted instances are still present.
		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, 2, "Expected the preexisting instances to remain since empty sync does nothing")

		found1, found2 := false, false
		for _, r := range res {
			if r.RuleUID == "preexisting-1" {
				found1 = true
			}
			if r.RuleUID == "preexisting-2" {
				found2 = true
			}
		}
		require.True(t, found1, "Expected preexisting-1 to remain")
		require.True(t, found2, "Expected preexisting-2 to remain")
	})

	t.Run("Should handle invalid instances by skipping them", func(t *testing.T) {
		// Create a batch with one valid and one invalid instance.
		validInstance := generateTestAlertInstance(orgID, "valid")

		invalidInstance := generateTestAlertInstance(orgID, "")
		// Make the invalid instance actually invalid.
		invalidInstance.RuleUID = ""

		err := ng.InstanceStore.FullSync(ctx, []models.AlertInstance{validInstance, invalidInstance}, 2, nil)
		require.NoError(t, err)

		// Only the valid instance should be saved.
		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, 1)
		require.Equal(t, "valid", res[0].RuleUID)
	})
					
						
							|  |  |  | 
 | 
					
						
	t.Run("Should handle batchSize larger than the number of instances", func(t *testing.T) {
		// Insert a small number of instances but use a large batchSize
		smallSet := []models.AlertInstance{
			generateTestAlertInstance(orgID, "batch-test1"),
			generateTestAlertInstance(orgID, "batch-test2"),
		}

		err := ng.InstanceStore.FullSync(ctx, smallSet, 100, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(smallSet))
		found1, found2 := false, false
		for _, r := range res {
			if r.RuleUID == "batch-test1" {
				found1 = true
			}
			if r.RuleUID == "batch-test2" {
				found2 = true
			}
		}
		require.True(t, found1)
		require.True(t, found2)
	})

	t.Run("Should handle a large set of instances with a moderate batchSize", func(t *testing.T) {
		// An empty sync is a no-op (see above); the full sync below replaces
		// all existing state anyway.
		err := ng.InstanceStore.FullSync(ctx, []models.AlertInstance{}, 1, nil)
		require.NoError(t, err)

		largeCount := 300
		largeSet := make([]models.AlertInstance, largeCount)
		for i := 0; i < largeCount; i++ {
			largeSet[i] = generateTestAlertInstance(orgID, fmt.Sprintf("large-%d", i))
		}

		err = ng.InstanceStore.FullSync(ctx, largeSet, 50, nil)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, largeCount)
	})
}

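// TestIntegrationFullSyncWithJitter exercises FullSync with a non-nil jitter
// function: a per-batch delay that spreads the batched writes over time
// instead of issuing them back-to-back. The plain FullSync tests above pass
// nil to opt out of jitter.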
func TestIntegrationFullSyncWithJitter(t *testing.T) {
	testutil.SkipIntegrationTestInShortMode(t)

	batchSize := 2

	ctx := context.Background()
	ng, _ := tests.SetupTestEnv(t, baseIntervalSeconds)

	orgID := int64(1)
	ruleUIDs := []string{"j1", "j2", "j3", "j4", "j5"}

	instances := make([]models.AlertInstance, len(ruleUIDs))
	for i, ruleUID := range ruleUIDs {
		instances[i] = generateTestAlertInstance(orgID, ruleUID)
	}

	// Simple jitter function for testing: batch N is delayed by N*100ms
	jitterFunc := func(batchIndex int) time.Duration {
		return time.Duration(batchIndex*100) * time.Millisecond
	}
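
	// For context: the production jitter (state_periodic_save_jitter_enabled)
	// spreads batches across roughly 85% of the periodic save interval. A
	// minimal sketch of such a delay function, assuming a fixed interval and
	// batch count (saveInterval, numBatches and the arithmetic here are
	// illustrative, not the actual implementation):
	//
	//	window := time.Duration(float64(saveInterval) * 0.85)
	//	perBatch := window / time.Duration(numBatches)
	//	jitter := func(batchIndex int) time.Duration {
	//		return perBatch * time.Duration(batchIndex)
	//	}
	//
	// For example, 200,000 instances at a batch size of 4,000 yield 50
	// batches; over a 5-minute interval that is one batch roughly every five
	// seconds within the ~4m15s window.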
					
						
	t.Run("Should do a proper full sync with jitter", func(t *testing.T) {
		err := ng.InstanceStore.FullSync(ctx, instances, batchSize, jitterFunc)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances))

		// Verify all instances were saved
		for _, ruleUID := range ruleUIDs {
			found := false
			for _, instance := range res {
				if instance.RuleUID == ruleUID {
					found = true
					break
				}
			}
			require.True(t, found, "Instance with RuleUID '%s' not found", ruleUID)
		}
	})

	t.Run("Should handle empty instances with jitter", func(t *testing.T) {
		err := ng.InstanceStore.FullSync(ctx, []models.AlertInstance{}, batchSize, jitterFunc)
		require.NoError(t, err)

		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, len(instances), "Empty sync should not delete existing instances")
	})

	t.Run("Should handle zero delays (immediate execution)", func(t *testing.T) {
		testInstances := make([]models.AlertInstance, 2)
		for i := 0; i < 2; i++ {
			testInstances[i] = generateTestAlertInstance(orgID, fmt.Sprintf("immediate-%d", i))
		}

		// Function that returns zero delays
		immediateJitterFunc := func(batchIndex int) time.Duration {
			return 0 * time.Second
		}

		start := time.Now()
		err := ng.InstanceStore.FullSync(ctx, testInstances, 1, immediateJitterFunc)
		elapsed := time.Since(start)
		require.NoError(t, err)

		// Should complete quickly since all delays are zero
		require.Less(t, elapsed, 500*time.Millisecond, "Zero delays should execute immediately")

		// Verify data was saved
		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, 2)
	})

	t.Run("Should execute jitter delays correctly and save data", func(t *testing.T) {
		testInstances := make([]models.AlertInstance, 4)
		for i := 0; i < 4; i++ {
			testInstances[i] = generateTestAlertInstance(orgID, fmt.Sprintf("jitter-test-%d", i))
		}

		// Track jitter function calls
		jitterCalls := []int{}
		realJitterFunc := func(batchIndex int) time.Duration {
			jitterCalls = append(jitterCalls, batchIndex)
			return time.Duration(batchIndex*200) * time.Millisecond // 0ms, 200ms delays
		}

		start := time.Now()
		err := ng.InstanceStore.FullSync(ctx, testInstances, 2, realJitterFunc)
		elapsed := time.Since(start)
		require.NoError(t, err)

		// Should take at least the maximum delay (200ms for batch 1)
		require.GreaterOrEqual(t, elapsed, 200*time.Millisecond, "Should wait for jitter delays")
		require.Less(t, elapsed, 1*time.Second, "Should not take too long")

		// Verify jitter function was called for each batch
		require.Equal(t, []int{0, 1}, jitterCalls, "Should call jitter function for each batch")

		// Verify all data was saved correctly
		res, err := ng.InstanceStore.ListAlertInstances(ctx, &models.ListAlertInstancesQuery{
			RuleOrgID: orgID,
		})
		require.NoError(t, err)
		require.Len(t, res, 4)

		// Verify specific instances were saved
		for i := 0; i < 4; i++ {
			expectedUID := fmt.Sprintf("jitter-test-%d", i)
			found := false
			for _, instance := range res {
				if instance.RuleUID == expectedUID {
					found = true
					break
				}
			}
			require.True(t, found, "Instance with RuleUID '%s' not found", expectedUID)
		}
	})
}
					
						
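// TestIntegration_ProtoInstanceDBStore_VerifyCompressedData verifies that,
// with the FlagAlertingSaveStateCompressed feature toggle enabled, alert state
// is written to the alert_rule_state table as a snappy-compressed protobuf
// payload that can be read back and decoded.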
func TestIntegration_ProtoInstanceDBStore_VerifyCompressedData(t *testing.T) {
	testutil.SkipIntegrationTestInShortMode(t)

	ctx := context.Background()
	ng, dbstore := tests.SetupTestEnv(
		t,
		baseIntervalSeconds,
		tests.WithFeatureToggles(
			featuremgmt.WithFeatures(
				featuremgmt.FlagAlertingSaveStateCompressed,
			),
		),
	)

	alertRule := tests.CreateTestAlertRule(t, ctx, dbstore, 60, 1)

	labelsHash := "hash1"
	reason := "reason"
	state := models.InstanceStateFiring
	instances := []models.AlertInstance{
		createAlertInstance(alertRule.OrgID, alertRule.UID, labelsHash, reason, state),
	}

	err := ng.InstanceStore.SaveAlertInstancesForRule(ctx, alertRule.GetKeyWithGroup(), instances)
	require.NoError(t, err)

	// Query raw data from the database
	type compressedRow struct {
		OrgID   int64  `xorm:"org_id"`
		RuleUID string `xorm:"rule_uid"`
		Data    []byte `xorm:"data"`
	}
	var rawData compressedRow
	err = dbstore.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
		_, err := sess.SQL("SELECT * FROM alert_rule_state").Get(&rawData)
		return err
	})
	require.NoError(t, err)

	// Decompress and compare
	require.NotNil(t, rawData)
	decompressedInstances, err := decompressAlertInstances(rawData.Data)
	require.NoError(t, err)

	require.Len(t, decompressedInstances, 1)
	require.Equal(t, instances[0].LabelsHash, decompressedInstances[0].LabelsHash)
	require.Equal(t, string(instances[0].CurrentState), decompressedInstances[0].CurrentState)
	require.Equal(t, instances[0].CurrentReason, decompressedInstances[0].CurrentReason)
}

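// decompressAlertInstances reverses the encoding exercised above: the raw
// column value is a snappy-framed stream wrapping a protobuf-encoded
// pb.AlertInstances message.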
func decompressAlertInstances(compressed []byte) ([]*pb.AlertInstance, error) {
	if len(compressed) == 0 {
		return nil, nil
	}

	reader := snappy.NewReader(bytes.NewReader(compressed))
	var b bytes.Buffer
	if _, err := b.ReadFrom(reader); err != nil {
		return nil, fmt.Errorf("failed to read compressed data: %w", err)
	}

	var instances pb.AlertInstances
	if err := proto.Unmarshal(b.Bytes(), &instances); err != nil {
		return nil, fmt.Errorf("failed to unmarshal protobuf: %w", err)
	}

	return instances.Instances, nil
}

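// generateTestAlertInstance builds a minimal firing alert instance for the
// given org and rule UID; the remaining fields are dummy values, so instances
// produced here differ only by the supplied org and rule UID.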
func generateTestAlertInstance(orgID int64, ruleID string) models.AlertInstance {
	return models.AlertInstance{
		AlertInstanceKey: models.AlertInstanceKey{
			RuleOrgID:  orgID,
			RuleUID:    ruleID,
			LabelsHash: "abc",
		},
		CurrentState: models.InstanceStateFiring,
		Labels: map[string]string{
			"hello": "world",
		},
		ResultFingerprint: "abc",
		CurrentStateEnd:   time.Now(),
		CurrentStateSince: time.Now(),
		LastEvalTime:      time.Now(),
		LastSentAt:        util.Pointer(time.Now()),
		FiredAt:           util.Pointer(time.Now()),
		ResolvedAt:        util.Pointer(time.Now()),
		CurrentReason:     "abc",
	}
}